Merge branch 'develop' into yotamfe/develop

This commit is contained in:
Axel Kohlmeyer
2023-01-26 11:47:32 -05:00
78 changed files with 12730 additions and 252 deletions

View File

@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir)
endfunction()
macro(pkg_depends PKG1 PKG2)
if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2}))
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package")
if(DEFINED BUILD_${PKG2})
if(PKG_${PKG1} AND NOT BUILD_${PKG2})
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON")
endif()
elseif(DEFINED PKG_${PKG2})
if(PKG_${PKG1} AND NOT PKG_${PKG2})
message(WARNING "The ${PKG1} package depends on the ${PKG2} package. Enabling it.")
set(PKG_${PKG2} ON CACHE BOOL "" FORCE)
endif()
endif()
endmacro()

View File

@ -1,4 +1,9 @@
find_package(ZLIB REQUIRED)
find_package(ZLIB)
if(NOT ZLIB_FOUND)
message(WARNING "No Zlib development support found. Disabling COMPRESS package...")
set(PKG_COMPRESS OFF CACHE BOOL "" FORCE)
return()
endif()
target_link_libraries(lammps PRIVATE ZLIB::ZLIB)
find_package(PkgConfig QUIET)

View File

@ -26,6 +26,19 @@ elseif(GPU_PREC STREQUAL "SINGLE")
set(GPU_PREC_SETTING "SINGLE_SINGLE")
endif()
option(GPU_DEBUG "Enable debugging code of the GPU package" OFF)
mark_as_advanced(GPU_DEBUG)
if(PKG_AMOEBA AND FFT_SINGLE)
message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT")
endif()
if (PKG_AMOEBA)
list(APPEND GPU_SOURCES
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp)
endif()
file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
@ -151,7 +164,12 @@ if(GPU_API STREQUAL "CUDA")
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS})
if(GPU_DEBUG)
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
else()
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
endif()
if(CUDPP_OPT)
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
@ -192,6 +210,7 @@ elseif(GPU_API STREQUAL "OPENCL")
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu
)
foreach(GPU_KERNEL ${GPU_LIB_CU})
@ -208,6 +227,7 @@ elseif(GPU_API STREQUAL "OPENCL")
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu)
list(APPEND GPU_LIB_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
@ -217,14 +237,18 @@ elseif(GPU_API STREQUAL "OPENCL")
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h
)
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING})
if(GPU_DEBUG)
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
else()
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
endif()
target_link_libraries(lammps PRIVATE gpu)
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
@ -374,8 +398,12 @@ elseif(GPU_API STREQUAL "HIP")
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_HIP)
target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING})
if(GPU_DEBUG)
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
else()
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
endif()
target_link_libraries(gpu PRIVATE hip::host)
if(HIP_USE_DEVICE_SORT)

View File

@ -126,10 +126,11 @@ CMake build
-D GPU_API=value # value = opencl (default) or cuda or hip
-D GPU_PREC=value # precision setting
# value = double or mixed (default) or single
-D HIP_PATH # path to HIP installation. Must be set if GPU_API=HIP
-D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda
# value = sm_XX, see below
# default is sm_50
# value = sm_XX (see below, default is sm_50)
-D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers
# value = yes or no (default)
-D HIP_PATH=value # value = path to HIP installation. Must be set if GPU_API=HIP
-D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip
# value depends on selected HIP_PLATFORM
# default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc

View File

@ -39,7 +39,7 @@ OPT.
* :doc:`agni (o) <pair_agni>`
* :doc:`airebo (io) <pair_airebo>`
* :doc:`airebo/morse (io) <pair_airebo>`
* :doc:`amoeba <pair_amoeba>`
* :doc:`amoeba (g) <pair_amoeba>`
* :doc:`atm <pair_atm>`
* :doc:`awpmd/cut <pair_awpmd>`
* :doc:`beck (go) <pair_beck>`
@ -126,7 +126,7 @@ OPT.
* :doc:`hbond/dreiding/lj (o) <pair_hbond_dreiding>`
* :doc:`hbond/dreiding/morse (o) <pair_hbond_dreiding>`
* :doc:`hdnnp <pair_hdnnp>`
* :doc:`hippo <pair_amoeba>`
* :doc:`hippo (g) <pair_amoeba>`
* :doc:`ilp/graphene/hbn (t) <pair_ilp_graphene_hbn>`
* :doc:`ilp/tmd (t) <pair_ilp_tmd>`
* :doc:`kolmogorov/crespi/full <pair_kolmogorov_crespi_full>`

View File

@ -39,6 +39,9 @@ Syntax
*masslimit* value = massmin massmax
massmin = minimum molecular weight of species to delete
massmax = maximum molecular weight of species to delete
*delete_rate_limit* value = Nlimit Nsteps
Nlimit = maximum number of deletions allowed to occur within interval
Nsteps = the interval (number of timesteps) over which to count deletions
Examples
""""""""
@ -142,7 +145,13 @@ When using the *masslimit* keyword, each line of the *filedel* file
contains the timestep on which deletions occurs, followed by how many
of each species are deleted (with quantities preceding chemical
formulae). The *specieslist* and *masslimit* keywords cannot both be
used in the same *reaxff/species* fix.
used in the same *reaxff/species* fix. The *delete_rate_limit*
keyword can enforce an upper limit on the overall rate of molecule
deletion. The number of deletion occurrences is limited to Nlimit
within an interval of Nsteps timesteps. When using the
*delete_rate_limit* keyword, no deletions are permitted to occur
within the first Nsteps timesteps of the first run (after reading
either a data or restart file).
----------

View File

@ -732,8 +732,8 @@ choices:
* Use one of the 4 NPT or NPH styles for the rigid bodies. Use the
*dilate* all option so that it will dilate the positions of the
*non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
*other thermostat) for the non-rigid particles.
non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
other thermostat) for the non-rigid particles.
* Use :doc:`fix npt <fix_nh>` for the group of non-rigid particles. Use
the *dilate* all option so that it will dilate the center-of-mass
positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT

View File

@ -1,11 +1,18 @@
.. index:: pair_style amoeba
.. index:: pair_style amoeba/gpu
.. index:: pair_style hippo
.. index:: pair_style hippo/gpu
pair_style amoeba command
=========================
Accelerator Variants: *amoeba/gpu*
pair_style hippo command
========================
Accelerator Variants: *hippo/gpu*
Syntax
""""""
@ -127,6 +134,10 @@ version discussed in :ref:`(Ponder) <amoeba-Ponder>`, :ref:`(Ren)
implementation of HIPPO in LAMMPS matches the version discussed in
:ref:`(Rackers) <amoeba-Rackers>`.
.. versionadded:: TBD
Accelerator support via the GPU package is available.
----------
Only a single pair_coeff command is used with either the *amoeba* and
@ -187,6 +198,19 @@ These pair styles can only be used via the *pair* keyword of the
----------
.. include:: accel_styles.rst
.. note::
Using the GPU accelerated pair styles 'amoeba/gpu' or 'hippo/gpu'
when compiling the GPU package for OpenCL has a few known issues
when running on integrated GPUs and the calculation may crash.
The GPU accelerated pair styles are also not (yet) compatible
with single precision FFTs.
----------
Restrictions
""""""""""""

View File

@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda
endif
gpu_SYSINC =
gpu_SYSLIB = -lcudart -lcuda
gpu_SYSLIB = -lcudart -lcuda -lcufft
gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs

View File

@ -1,9 +1,17 @@
# Common headers for kernels
PRE1_H = lal_preprocessor.h lal_aux_fun1.h
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \
lal_pre_cuda_hip.h
ALL_H = $(NVD_H) $(wildcard ./lal_*.h)
# Headers for Host files
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H)
# Source files
SRCS := $(wildcard ./lal_*.cpp)
OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
@ -54,13 +62,40 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
$(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu
$(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@
# host code compilation
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H)
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
#ifdef CUDPP_OPT
@ -77,10 +112,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini
#endif
# build libgpu.a

View File

@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h)
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h
# Headers for Host files
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H)
@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.
$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h
$(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h;
$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h
$(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h;
$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;

View File

@ -26,6 +26,9 @@
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif
#ifndef UCL_NO_API_CHECK

View File

@ -33,6 +33,9 @@
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif
#ifndef UCL_NO_API_CHECK

View File

@ -309,15 +309,14 @@ class UCL_Device {
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return 0; }
inline size_t max_pitch(const int) { return 0; }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported() { return sharing_supported(_device); }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported(const int i)
{ return true; }
inline bool sharing_supported(const int) { return true; }
/// True if the device is a sub-device
inline bool is_subdevice()

View File

@ -33,6 +33,9 @@
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif
#ifndef UCL_NO_API_CHECK

View File

@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
if (kind==UCL_READ_ONLY) {
@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
const size_t /*dst_offset*/, const size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 1S\n";
@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
const size_t dst_offset, const size_t /*src_offset*/) {
if (src.cbegin()==dst.cbegin()) {
if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE

View File

@ -35,19 +35,19 @@ class UCL_Texture {
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(UCL_Program &prog, const char *texture_name) { }
inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name) { }
inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) { }
inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) { }
inline void allow(UCL_Kernel & /*kernel*/) { }
private:
friend class UCL_Kernel;
@ -62,7 +62,7 @@ class UCL_Const {
inline UCL_Const(UCL_Program &prog, const char *global_name)
{ get_global(prog,global_name); }
/// Set the global reference for this object
inline void get_global(UCL_Program &prog, const char *global_name) {
inline void get_global(UCL_Program &prog, const char * /*global_name*/) {
if (_active) {
CL_DESTRUCT_CALL(clReleaseContext(_context));
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));

View File

@ -71,7 +71,7 @@ class UCL_Timer {
inline void init(UCL_Device &dev) { init(dev,dev.cq()); }
/// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) {
inline void init(UCL_Device & /*dev*/, command_queue &cq) {
clear();
_cq=cq;
clRetainCommandQueue(_cq);

View File

@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> {
// Should never be here
template <int host_t1, int host_t2> struct _host_host_copy {
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) {
assert(0==1);
}
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) {
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) {
assert(0==1);
}
};
@ -470,24 +469,22 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
// Neither on host or both on host
template <> struct _ucl_cast_copy<1,1> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
};
@ -495,24 +492,22 @@ template <> struct _ucl_cast_copy<1,1> {
// Neither on host or both on host
template <> struct _ucl_cast_copy<0,0> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
};

View File

@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
@ -230,8 +230,8 @@ class UCL_D_Vec : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
inline void view_offset(const size_t offset,ucl_type &input,
const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif

View File

@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat {
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat {
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols,
UCL_Device &dev) {
#ifdef UCL_DEBUG
assert(rows==1);
@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat {
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
inline void view_offset(const size_t offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows),
const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);

View File

@ -27,7 +27,7 @@ template <int st> struct _ucl_s_obj_help;
// -- Can potentially use same memory if shared by accelerator
template <> struct _ucl_s_obj_help<1> {
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
static inline int alloc(t1 &host, t2 &device, t3 & /*_buffer*/,
const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> {
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) {
ucl_copy(dst,src,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) {
ucl_copy(dst,src,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) {
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) {
ucl_copy(dst,src,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) {
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) {
ucl_copy(dst,src,cols,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) {
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) {
ucl_copy(dst,src,rows,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) {
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) {
ucl_copy(dst,src,rows,cols,cq);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
return UCL_SUCCESS;
@ -353,7 +349,7 @@ template <int st> struct _ucl_s_obj_help {
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
static inline int dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) {
int err=buff.resize(cols);
if (err!=UCL_SUCCESS)
return err;

322
lib/gpu/lal_amoeba.cpp Normal file
View File

@ -0,0 +1,322 @@
/***************************************************************************
amoeba.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the amoeba pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#if defined(USE_OPENCL)
#include "amoeba_cl.h"
#elif defined(USE_CUDART)
const char *amoeba=0;
#else
#include "amoeba_cubin.h"
#endif
#include "lal_amoeba.h"
#include <cassert>
namespace LAMMPS_AL {
#define AmoebaT Amoeba<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
// Constructor: initialize the base AMOEBA accelerator class; device-side
// coefficient tables are not allocated until init() is called.
template <class numtyp, class acctyp>
AmoebaT::Amoeba() : BaseAmoeba<numtyp,acctyp>(),
_allocated(false) {
}
// Destructor: release any device buffers allocated by init() via clear().
template <class numtyp, class acctyp>
AmoebaT::~Amoeba() {
clear();
}
// Return the per-atom memory estimate (bytes) for a neighbor list with up
// to max_nbors entries; delegates to the atomic-style base-class estimate.
template <class numtyp, class acctyp>
int AmoebaT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
// Initialize the AMOEBA accelerator: compile/load the kernels via the base
// class, then copy the per-type and per-class coefficient tables plus the
// special-neighbor scaling factors to read-only device memory.
//
// Parameters (host_* arrays are indexed by AMOEBA atom type or class):
//   ntypes, max_amtype, max_amclass - counts of LAMMPS types / AMOEBA
//       types / AMOEBA classes
//   host_pdamp, host_thole, host_dirdamp, host_amtype2class - per-type
//       polarization damping parameters and type->class map
//   host_special_* - 5-entry special-bond scaling factors (several are
//       unused here and deliberately unnamed in the signature)
//   host_csix, host_adisp - per-class dispersion coefficients
//   nlocal, nall, max_nbors, maxspecial, maxspecial15 - sizing inputs
//   cell_size, gpu_split, _screen - base-class device setup inputs
//   polar_dscale, polar_uscale - polar interaction scale factors
// Returns 0 on success, otherwise the nonzero error code from init_atomic().
template <class numtyp, class acctyp>
int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
const double *host_pdamp, const double *host_thole,
const double *host_dirdamp, const int *host_amtype2class,
const double *host_special_hal,
const double * /*host_special_repel*/,
const double * /*host_special_disp*/,
const double *host_special_mpole,
const double * /*host_special_polar_wscale*/,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_csix, const double *host_adisp,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, const double gpu_split, FILE *_screen,
const double polar_dscale, const double polar_uscale) {
int success;
// Compile/load all AMOEBA kernels and set up atom/neighbor storage.
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
cell_size,gpu_split,_screen,amoeba,
"k_amoeba_multipole", "k_amoeba_udirect2b",
"k_amoeba_umutual2b", "k_amoeba_polar",
"k_amoeba_fphi_uind", "k_amoeba_fphi_mpole",
"k_amoeba_short_nbor", "k_amoeba_special15");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
// Pack per-amtype parameters into one float4-style table:
// x=pdamp, y=thole, z=dirdamp, w=amtype2class.
for (int i = 0; i < max_amtype; i++) {
host_write[i].x = host_pdamp[i];
host_write[i].y = host_thole[i];
host_write[i].z = host_dirdamp[i];
host_write[i].w = host_amtype2class[i];
}
coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(coeff_amtype,host_write,false);
// Pack per-amclass dispersion coefficients: x=csix, y=adisp (z,w unused).
UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
for (int i = 0; i < max_amclass; i++) {
host_write2[i].x = host_csix[i];
host_write2[i].y = host_adisp[i];
host_write2[i].z = (numtyp)0;
host_write2[i].w = (numtyp)0;
}
coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(coeff_amclass,host_write2,false);
// Special-bond scale factors (5 entries): x=hal, y=polar_piscale,
// z=polar_pscale, w=mpole.
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<5; i++) {
dview[i].x=host_special_hal[i];
dview[i].y=host_special_polar_piscale[i];
dview[i].z=host_special_polar_pscale[i];
dview[i].w=host_special_mpole[i];
}
ucl_copy(sp_amoeba,dview,5,false);
_polar_dscale = polar_dscale;
_polar_uscale = polar_uscale;
_allocated=true;
// Track device memory consumed by this style's persistent buffers.
this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes()
+ sp_amoeba.row_bytes() + this->_tep.row_bytes()
+ this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
+ this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
+ this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
return 0;
}
// Free the device-side coefficient tables allocated by init() and the
// base-class atomic storage. Safe to call repeatedly (no-op when not
// allocated), which is why the destructor can call it unconditionally.
template <class numtyp, class acctyp>
void AmoebaT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff_amtype.clear();
coeff_amclass.clear();
sp_amoeba.clear();
this->clear_atomic();
}
// Return host-side memory usage in bytes: base-class atomic storage plus
// the size of this object itself.
template <class numtyp, class acctyp>
double AmoebaT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate the multipole real-space term, returning tep
// ---------------------------------------------------------------------------
// Builds the short neighbor list for the multipole cutoff and launches the
// k_amoeba_multipole kernel. Returns the launch grid size GX (0 if there
// are no local atoms, in which case nothing is launched).
template <class numtyp, class acctyp>
int AmoebaT::multipole_real(const int eflag, const int vflag) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// Build the short neighbor list for the cutoff off2_mpole,
// at this point mpole is the first kernel in a time step for AMOEBA
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_mpole, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// Real-space multipole kernel: accumulates forces/energies and writes
// the torque-equivalent-potential (tep) array.
this->k_multipole.set_size(GX,BX);
this->k_multipole.run(&this->atom->x, &this->atom->extra,
&coeff_amtype, &sp_amoeba,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_felec,
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Launch the real-space permanent field kernel
// ---------------------------------------------------------------------------
// Launches the real-space permanent (direct) field kernel; results land in
// _fieldp. Builds the polar-cutoff short neighbor list on first use in the
// time step and caches it via short_nbor_polar_avail. Energy/virial flags
// are unused here. Returns the grid size used (0 if no local atoms).
template <class numtyp, class acctyp>
int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff _off2_polar, if not done yet
  // this is the first kernel in a time step where _off2_polar is used
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(),
                           &this->dev_short_nbor, &this->_off2_polar, &ainum,
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_udirect2b.set_size(GX,BX);
  this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor,
                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
                        &this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
                        &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Launch the real-space induced field kernel, returning field and fieldp
// ---------------------------------------------------------------------------
// Launches the real-space induced (mutual) field kernel, writing field and
// fieldp contributions into _fieldp. Reuses the cached polar-cutoff short
// neighbor list when available. Returns the grid size used.
template <class numtyp, class acctyp>
int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list if not done yet
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(), &this->dev_short_nbor,
                           &this->_off2_polar, &ainum, &nbor_pitch,
                           &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_umutual2b.set_size(GX,BX);
  this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
                        &nbor_pitch, &this->_threads_per_atom, &this->_aewald,
                        &this->_off2_polar, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Launch the polar real-space kernel, returning tep
// ---------------------------------------------------------------------------
// Launches the polar real-space kernel; accumulates force/energy/virial and
// writes the per-atom field into _tep. As the last real-space kernel of the
// time step, it invalidates the cached polar short neighbor list on exit.
template <class numtyp, class acctyp>
int AmoebaT::polar_real(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));

  // Disabled experiment: shrink BX to raise occupancy when GX is smaller
  // than the number of compute units. Left for reference.
  /*
  const int cus = this->device->gpu->cus();
  while (GX < cus && GX > 1) {
    BX /= 2;
    GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
  }
  */

  this->time_pair.start();

  // Build the short neighbor list if not done yet
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(),
                           &this->dev_short_nbor, &this->_off2_polar, &ainum,
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_polar.set_size(GX,BX);
  this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                    &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv, &this->_tep,
                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                    &this->_threads_per_atom, &this->_aewald, &this->_felec,
                    &this->_off2_polar, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  // Signal that short nbor list is not avail for the next time step
  // do it here because polar_real() is the last kernel in a time step at this point
  this->short_nbor_polar_avail = false;

  return GX;
}
template class Amoeba<PRECISION,ACC_PRECISION>;
}

2099
lib/gpu/lal_amoeba.cu Normal file

File diff suppressed because it is too large Load Diff

100
lib/gpu/lal_amoeba.h Normal file
View File

@ -0,0 +1,100 @@
/***************************************************************************
amoeba.h
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the amoeba pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_AMOEBA_H
#define LAL_AMOEBA_H

#include "lal_base_amoeba.h"

namespace LAMMPS_AL {

// GPU-accelerated AMOEBA pair style. The real-space kernel drivers are
// inherited from BaseAmoeba; this class owns the per-type coefficient
// tables that are copied to the device during init().
template <class numtyp, class acctyp>
class Amoeba : public BaseAmoeba<numtyp, acctyp> {
 public:
  Amoeba();
  ~Amoeba();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  // NOTE(review): the special-array order declared here (mpole, hal, repel,
  // disp) appears to differ from the call in lal_amoeba_ext.cpp, which passes
  // (hal, repel, disp, mpole); all are const double* so this compiles either
  // way — verify against the definition of Amoeba::init.
  int init(const int ntypes, const int max_amtype, const int max_amclass,
           const double *host_pdamp, const double *host_thole,
           const double *host_dirdamp, const int *host_amtype2class,
           const double *host_special_mpole,
           const double *host_special_hal,
           const double *host_special_repel,
           const double *host_special_disp,
           const double *host_special_polar_wscale,
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
           const double *host_csix, const double *host_adisp,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const int maxspecial15, const double cell_size,
           const double gpu_split, FILE *_screen,
           const double polar_dscale, const double polar_uscale);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
  /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
  UCL_D_Vec<numtyp4> coeff_amtype;
  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
  UCL_D_Vec<numtyp4> coeff_amclass;
  /// Special amoeba values [0-4]:
  /// sp_amoeba.x = special_hal
  /// sp_amoeba.y = special_polar_pscale,
  /// sp_amoeba.z = special_polar_piscale
  /// sp_amoeba.w = special_mpole
  UCL_D_Vec<numtyp4> sp_amoeba;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

  numtyp _polar_dscale, _polar_uscale;
  numtyp _qqrd2e;

 protected:
  /// True after init() allocated the coefficient tables; cleared by clear()
  bool _allocated;
  int multipole_real(const int eflag, const int vflag);
  int udirect2b(const int eflag, const int vflag);
  int umutual2b(const int eflag, const int vflag);
  int polar_real(const int eflag, const int vflag);
};

}
#endif

213
lib/gpu/lal_amoeba_ext.cpp Normal file
View File

@ -0,0 +1,213 @@
/***************************************************************************
amoeba_ext.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Functions for LAMMPS access to amoeba acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include <iostream>
#include <cassert>
#include <cmath>
#include "lal_amoeba.h"
using namespace std;
using namespace LAMMPS_AL;
static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
// Allocate host/device memory and copy AMOEBA constants to the device.
// Follows the standard GPU-package staging pattern: process 0 initializes
// (and compiles kernels) first behind a world barrier, then the remaining
// ranks initialize one GPU-rank at a time behind gpu barriers, so kernel
// compilation happens only once per node. Returns the init status code
// (0 on success) and reports the device mode via gpu_mode.
int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                    const double *host_pdamp, const double *host_thole,
                    const double *host_dirdamp, const int *host_amtype2class,
                    const double *host_special_hal,
                    const double *host_special_repel,
                    const double *host_special_disp,
                    const double *host_special_mpole,
                    const double *host_special_polar_wscale,
                    const double *host_special_polar_piscale,
                    const double *host_special_polar_pscale,
                    const double *host_csix, const double *host_adisp,
                    const int nlocal, const int nall, const int max_nbors,
                    const int maxspecial, const int maxspecial15,
                    const double cell_size, int &gpu_mode, FILE *screen,
                    const double polar_dscale, const double polar_uscale) {
  AMOEBAMF.clear();
  gpu_mode=AMOEBAMF.device->gpu_mode();
  double gpu_split=AMOEBAMF.device->particle_split();
  int first_gpu=AMOEBAMF.device->first_device();
  int last_gpu=AMOEBAMF.device->last_device();
  int world_me=AMOEBAMF.device->world_me();
  int gpu_rank=AMOEBAMF.device->gpu_rank();
  int procs_per_gpu=AMOEBAMF.device->procs_per_gpu();

  AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu);

  // Only one rank per replica prints progress messages
  bool message=false;
  if (AMOEBAMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // World rank 0 initializes first so kernels are compiled exactly once
  int init_ok=0;
  if (world_me==0)
    init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
                          host_pdamp, host_thole, host_dirdamp,
                          host_amtype2class, host_special_hal,
                          host_special_repel, host_special_disp,
                          host_special_mpole, host_special_polar_wscale,
                          host_special_polar_piscale, host_special_polar_pscale,
                          host_csix, host_adisp, nlocal, nall, max_nbors,
                          maxspecial, maxspecial15, cell_size, gpu_split,
                          screen, polar_dscale, polar_uscale);

  AMOEBAMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Remaining ranks initialize in gpu-rank order, one wave at a time
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
                            host_pdamp, host_thole, host_dirdamp,
                            host_amtype2class, host_special_hal,
                            host_special_repel, host_special_disp,
                            host_special_mpole, host_special_polar_wscale,
                            host_special_polar_piscale, host_special_polar_pscale,
                            host_csix, host_adisp, nlocal, nall, max_nbors,
                            maxspecial, maxspecial15, cell_size, gpu_split,
                            screen, polar_dscale, polar_uscale);

    AMOEBAMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    AMOEBAMF.estimate_gpu_overhead();
  return init_ok;
}
// Free all host and device data held by the global AMOEBA instance.
void amoeba_gpu_clear() {
  AMOEBAMF.clear();
}
// C-linkage wrapper for BaseAmoeba::precompute: resizes per-atom arrays,
// transfers extra data and builds neighbor lists for the time step.
// The uind/uinp/pval pointers are intentionally dropped (nullptr) here;
// they are supplied later by the per-kernel compute wrappers.
int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall,
                            double **host_x, int *host_type, int *host_amtype,
                            int *host_amgroup, double **host_rpole, double ** /*host_uind*/,
                            double ** /*host_uinp*/, double * /*host_pval*/,
                            double *sublo, double *subhi, tagint *tag,
                            int **nspecial, tagint **special,
                            int *nspecial15, tagint **special15,
                            const bool eflag_in, const bool vflag_in,
                            const bool eatom, const bool vatom, int &host_start,
                            int **ilist, int **jnum, const double cpu_time,
                            bool &success, double *host_q, double *boxlo, double *prd) {
  return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type,
                             host_amtype, host_amgroup, host_rpole,
                             nullptr, nullptr, nullptr, sublo, subhi, tag,
                             nspecial, special, nspecial15, special15,
                             eflag_in, vflag_in, eatom, vatom,
                             host_start, ilist, jnum, cpu_time,
                             success, host_q, boxlo, prd);
}
// Wrapper for the multipole real-space computation; the per-atom result is
// returned to the host through tep_ptr (pval slot passed as nullptr).
void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
           const int nall, double **host_x, int *host_type,
           int *host_amtype, int *host_amgroup, double **host_rpole,
           double *sublo, double *subhi, tagint *tag, int **nspecial,
           tagint **special, int *nspecial15, tagint** special15,
           const bool eflag, const bool vflag, const bool eatom,
           const bool vatom, int &host_start,
           int **ilist, int **jnum, const double cpu_time,
           bool &success, const double aewald, const double felec, const double off2,
           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
    host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi,
    tag, nspecial, special, nspecial15, special15,
    eflag, vflag, eatom, vatom, host_start, ilist, jnum,
    cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
}
// Wrapper for the permanent (direct) field computation; the field/fieldp
// arrays are returned through fieldp_ptr (pval slot passed as nullptr).
void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp,
                                  const double aewald, const double off2, void **fieldp_ptr) {
  AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
                             aewald, off2, fieldp_ptr);
}
// Wrapper for the induced (mutual) field computation; results returned
// through fieldp_ptr (pval slot passed as nullptr).
void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp,
                                  const double aewald, const double off2, void **fieldp_ptr) {
  AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
                             aewald, off2, fieldp_ptr);
}
// Copy the device-side field/fieldp accumulators back to the host pointer.
void amoeba_gpu_update_fieldp(void **fieldp_ptr) {
  AMOEBAMF.update_fieldp(fieldp_ptr);
}
// Wrapper for the polar real-space computation; the per-atom result is
// returned through tep_ptr (pval slot passed as nullptr).
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                   double **host_uind, double **host_uinp,
                                   const bool eflag_in, const bool vflag_in,
                                   const bool eatom, const bool vatom,
                                   const double aewald, const double felec, const double off2,
                                   void **tep_ptr) {
  AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
                              eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
// Transfer k-space spline coefficients (thetai1..3), grid indices and the
// out-of-range grid bounds to the device ahead of the PME grid kernels.
void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
                                  double ***host_thetai1, double ***host_thetai2,
                                  double ***host_thetai3, int** igrid,
                                  const int nzlo_out, const int nzhi_out,
                                  const int nylo_out, const int nyhi_out,
                                  const int nxlo_out, const int nxhi_out) {
  AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid,
                             nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out);
}
// Wrapper for the induced-dipole grid-to-potential (fphi_uind) kernel.
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
                          void **host_fdip_phi2, void **host_fdip_sum_phi) {
  AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1,
                             host_fdip_phi2, host_fdip_sum_phi);
}
// Wrapper for the multipole grid-to-potential (fphi_mpole) kernel.
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) {
  AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec);
}
// Prepare the device FFT plan for the given element count and type.
void amoeba_setup_fft(const int numel, const int element_type) {
  AMOEBAMF.setup_fft(numel, element_type);
}
// Run a 1-D FFT on the device; mode selects forward/backward transform.
void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) {
  AMOEBAMF.compute_fft1d(in, out, numel, mode);
}
// Report host memory used by the AMOEBA acceleration library.
double amoeba_gpu_bytes() {
  return AMOEBAMF.host_memory_usage();
}

View File

@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const {
bytes+=sizeof(numtyp);
if (_vel)
bytes+=4*sizeof(numtyp);
if (_extra_fields>0)
bytes+=_extra_fields*sizeof(numtyp4);
return bytes;
}
@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) {
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=v.device.row_bytes();
}
if (_extra_fields>0) {
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=extra.device.row_bytes();
}
if (_gpu_nbor>0) {
if (_bonds) {
@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) {
template <class numtyp, class acctyp>
bool AtomT::add_fields(const bool charge, const bool rot,
const int gpu_nbor, const bool bonds, const bool vel) {
const int gpu_nbor, const bool bonds, const bool vel,
const int extra_fields) {
bool success=true;
// Ignore host/device transfers?
int gpu_bytes=0;
@ -191,7 +199,17 @@ bool AtomT::add_fields(const bool charge, const bool rot,
}
}
if (bonds && !_bonds) {
if (extra_fields > 0 && _extra_fields==0) {
_extra_fields=extra_fields;
_other=true;
if (_host_view==false) {
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=extra.device.row_bytes();
}
}
if (bonds && _bonds==false) {
_bonds=true;
if (_bonds && _gpu_nbor>0) {
success=success && (dev_tag.alloc(_max_atoms,*dev,
@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
template <class numtyp, class acctyp>
bool AtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel,
const int extra_fields) {
clear();
bool success=true;
@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
_q_avail=false;
_quat_avail=false;
_v_avail=false;
_extra_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
_rot=rot;
_vel=vel;
_other=_charge || _rot || _vel;
_extra_fields=extra_fields;
_other=_charge || _rot || _vel || (extra_fields>0);
dev=&devi;
_time_transfer=0;
@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
time_q.init(*dev);
time_quat.init(*dev);
time_vel.init(*dev);
time_extra.init(*dev);
time_pos.zero();
time_q.zero();
time_quat.zero();
time_vel.zero();
time_extra.zero();
_time_cast=0.0;
#ifdef GPU_CAST
@ -308,6 +333,8 @@ void AtomT::clear_resize() {
quat.clear();
if (_vel)
v.clear();
if (_extra_fields>0)
extra.clear();
dev_cell_id.clear();
dev_particle_id.clear();
@ -350,6 +377,7 @@ void AtomT::clear() {
time_q.clear();
time_quat.clear();
time_vel.clear();
time_extra.clear();
clear_resize();
#ifdef GPU_CAST
@ -370,12 +398,19 @@ double AtomT::host_memory_usage() const {
atom_bytes+=4;
if (_vel)
atom_bytes+=4;
if (_extra_fields>0)
atom_bytes+=_extra_fields;
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
}
#ifdef USE_CUDPP
#define USE_CUDPP_ARG(arg) arg
#else
#define USE_CUDPP_ARG(arg)
#endif
// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void AtomT::sort_neighbor(const int num_atoms) {
void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) {
#ifdef USE_CUDPP
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin(),

View File

@ -76,7 +76,7 @@ class Atom {
* gpu_nbor 2 if binning on host and neighboring on device **/
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
const bool vel=false);
const bool vel=false, const int extra_fields=0);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
@ -96,7 +96,7 @@ class Atom {
* gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
const bool bonds, const bool vel=false);
const bool bonds, const bool vel=false, const int extra_fields=0);
/// Returns true if GPU is using charges
bool charge() { return _charge; }
@ -107,6 +107,9 @@ class Atom {
/// Returns true if GPU is using velocities
bool velocity() { return _vel; }
/// Returns true if GPU is using extra fields
bool using_extra() { return (_extra_fields>0); }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
@ -128,6 +131,8 @@ class Atom {
time_quat.add_to_total();
if (_vel)
time_vel.add_to_total();
if (_extra_fields>0)
time_extra.add_to_total();
}
/// Add copy times to timers
@ -139,6 +144,8 @@ class Atom {
time_quat.zero();
if (_vel)
time_vel.zero();
if (_extra_fields>0)
time_extra.zero();
}
/// Return the total time for host/device data transfer
@ -158,6 +165,10 @@ class Atom {
total+=time_vel.total_seconds();
time_vel.zero_total();
}
if (_extra_fields>0) {
total+=time_extra.total_seconds();
time_extra.zero_total();
}
return total+_time_transfer/1000.0;
}
@ -281,7 +292,11 @@ class Atom {
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; }
/// Signal that we need to transfer atom extra data for next kernel call
inline void extra_data_unavail()
{ _extra_avail=false; }
typedef struct { double x,y,z; } vec3d;
typedef struct { numtyp x,y,z,w; } vec4d_t;
@ -312,7 +327,7 @@ class Atom {
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) {
time_pos.start();
if (_x_avail==false) {
#ifdef GPU_CAST
@ -426,7 +441,7 @@ class Atom {
/// Copy velocities and tags to device asynchronously
/** Copies nall() elements **/
inline void add_v_data(double **host_ptr, tagint *host_tag) {
inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
time_vel.start();
if (_v_avail==false) {
#ifdef GPU_CAST
@ -450,6 +465,33 @@ class Atom {
add_v_data(host_ptr,host_tag);
}
// Cast extras to write buffer
template<class cpytyp>
inline void cast_extra_data(cpytyp *host_ptr) {
if (_extra_avail==false) {
double t=MPI_Wtime();
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
#pragma omp parallel for simd schedule(static)
#elif (LAL_USE_OMP_SIMD == 1)
#pragma omp simd
#endif
for (int i=0; i<_nall*_extra_fields; i++)
extra[i]=host_ptr[i];
_time_cast+=MPI_Wtime()-t;
}
}
// Copy extras to device
/** Copies nall()*_extra elements **/
inline void add_extra_data() {
time_extra.start();
if (_extra_avail==false) {
extra.update_device(_nall*_extra_fields,true);
_extra_avail=true;
}
time_extra.stop();
}
/// Add in casting time from additional data (seconds)
inline void add_cast_time(double t) { _time_cast+=t; }
@ -473,6 +515,8 @@ class Atom {
UCL_Vector<numtyp,numtyp> quat;
/// Velocities
UCL_Vector<numtyp,numtyp> v;
/// Extras
UCL_Vector<numtyp4,numtyp4> extra;
#ifdef GPU_CAST
UCL_Vector<numtyp,numtyp> x_cast;
@ -493,7 +537,7 @@ class Atom {
UCL_H_Vec<int> host_particle_id;
/// Device timers
UCL_Timer time_pos, time_q, time_quat, time_vel;
UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra;
/// Geryon device
UCL_Device *dev;
@ -508,11 +552,12 @@ class Atom {
bool _compiled;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized;
bool alloc(const int nall);
bool _allocated, _rot, _charge, _bonds, _vel, _other;
int _extra_fields;
int _max_atoms, _nall, _gpu_nbor;
bool _host_view;
double _time_cast, _time_transfer;

962
lib/gpu/lal_base_amoeba.cpp Normal file
View File

@ -0,0 +1,962 @@
/***************************************************************************
base_amoeba.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
pair_program=nullptr;
ucl_device=nullptr;
}
template <class numtyp, class acctyp>
BaseAmoebaT::~BaseAmoeba() {
delete ans;
delete nbor;
k_multipole.clear();
k_udirect2b.clear();
k_umutual2b.clear();
k_fphi_uind.clear();
k_fphi_mpole.clear();
k_polar.clear();
k_special15.clear();
k_short_nbor.clear();
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
if (fft_plan_created) cufftDestroy(plan);
#endif
if (pair_program) delete pair_program;
}
template <class numtyp, class acctyp>
int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const int maxspecial15,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name_multipole,
const char *k_name_udirect2b,
const char *k_name_umutual2b,
const char *k_name_polar,
const char *k_name_fphi_uind,
const char *k_name_fphi_mpole,
const char *k_name_short_nbor,
const char* k_name_special15) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
bool charge = true;
bool rot = false;
bool vel = false;
_extra_fields = 24; // round up to accomodate quadruples of numtyp values
// rpole 13; uind 3; uinp 3; amtype, amgroup; pval
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name_multipole,
k_name_udirect2b, k_name_umutual2b,k_name_polar,
k_name_fphi_uind, k_name_fphi_mpole,
k_name_short_nbor, k_name_special15);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else {
_nbor_data=&(nbor->dev_nbor);
}
bool alloc_packed=false;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
_gpu_host,max_nbors,cell_size,alloc_packed,
_threads_per_atom);
if (success!=0)
return success;
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
_maxspecial=maxspecial;
_maxspecial15=maxspecial15;
// allocate per-atom array tep
int ef_nall=nlocal; //nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_fieldp_size = _max_tep_size;
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_thetai_size = 0;
_nmax = nall;
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
fft_plan_created = false;
#endif
#ifdef ASYNC_DEVICE_COPY
_end_command_queue=ucl_device->num_queues();
ucl_device->push_command_queue();
#endif
return success;
}
template <class numtyp, class acctyp>
void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseAmoebaT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
time_pair.clear();
hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear();
ans->clear();
_tep.clear();
_fieldp.clear();
_thetai1.clear();
_thetai2.clear();
_thetai3.clear();
_igrid.clear();
_fdip_phi1.clear();
_fdip_phi2.clear();
_fdip_sum_phi.clear();
_cgrid_brick.clear();
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return nullptr;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return 0;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
// add one-five neighbors
if (_maxspecial15>0) {
UCL_H_Vec<int> view_nspecial15;
UCL_H_Vec<tagint> view_special15;
view_nspecial15.view(nspecial15,nall,*ucl_device);
view_special15.view(special15[0],nall*_maxspecial15,*ucl_device);
ucl_copy(dev_nspecial15,view_nspecial15,nall,false);
ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false);
nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall);
add_onefive_neighbors();
}
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return mn;
}
// ---------------------------------------------------------------------------
// Prepare for multiple kernel calls in a time step:
// - reallocate per-atom arrays, if needed
// - transfer extra data from host to device
// - build the full neighbor lists for use by different kernels
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **&ilist, int **&jnum, const double cpu_time,
bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) {
acc_timers();
if (eatom) _eflag=2;
else if (eflag_in) _eflag=1;
else _eflag=0;
if (vatom) _vflag=2;
else if (vflag_in) _vflag=1;
else _vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (_eflag) _eflag=2;
if (_vflag) _vflag=2;
#endif
set_kernel(_eflag,_vflag);
// ------------------- Resize 1-5 neighbor arrays ------------------------
if (nall>_nmax) {
_nmax = nall;
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
}
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return nullptr;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, nspecial15, special15,
success);
if (!success)
return nullptr;
atom->cast_q_data(host_q);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_q_data();
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
atom->add_extra_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
// re-allocate dev_short_nbor if necessary
if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(inum_full)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Compute multipole real-space part
// precompute() should be already invoked before mem (re)allocation
// this is the first part in a time step done on the GPU for AMOEBA for now
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
                                         const int /*nall*/, double ** /*host_x*/,
                                         int * /*host_type*/, int * /*host_amtype*/,
                                         int * /*host_amgroup*/, double ** /*host_rpole*/,
                                         double */*host_pval*/, double * /*sublo*/,
                                         double * /*subhi*/, tagint * /*tag*/,
                                         int ** /*nspecial*/, tagint ** /*special*/,
                                         int * /*nspecial15*/, tagint ** /*special15*/,
                                         const bool /*eflag_in*/, const bool /*vflag_in*/,
                                         const bool /*eatom*/, const bool /*vatom*/,
                                         int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                                         const double /*cpu_time*/, bool & /*success*/,
                                         const double aewald, const double felec,
                                         const double off2_mpole, double * /*host_q*/,
                                         double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {
  // ------------------- Resize _tep array ------------------------
  // _tep holds 4 values per atom; grow with 10% padding when inum_full exceeds
  // the current capacity
  if (inum_full>_max_tep_size) {
    _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    _tep.resize(_max_tep_size*4);
  }
  // expose the host-side tep buffer to the caller
  *tep_ptr=_tep.host.begin();

  // cutoff and Ewald parameters used by the multipole real-space kernel
  _off2_mpole = off2_mpole;
  _felec = felec;
  _aewald = aewald;
  multipole_real(_eflag,_vflag);

  // leave the answers (forces, energies and virial) on the device,
  // only copy them back in the last kernel (polar_real)
  //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  //device->add_ans_object(ans);

  // copy tep from device to host
  _tep.update_host(_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute the direct real space part
// of the permanent field
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                    double **host_uind, double **host_uinp, double *host_pval,
                                    const double aewald, const double off2_polar,
                                    void** fieldp_ptr) {
  // all the necessary data arrays are already copied from host to device
  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  // expose the host-side field buffer to the caller
  *fieldp_ptr=_fieldp.host.begin();

  // specify the correct cutoff and alpha values
  _off2_polar = off2_polar;
  _aewald = aewald;
  udirect2b(_eflag,_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
  // 8 = two arrays of presumably 4 entries per atom — confirm against kernel layout
  _fieldp.update_host(_max_fieldp_size*8,false);
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute the direct real space part
// of the induced field
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/,
                                    double **host_uind, double **host_uinp, double * /*host_pval*/,
                                    const double aewald, const double off2_polar,
                                    void** /*fieldp_ptr*/) {
  // only copy the necessary data arrays that are updated over the iterations
  // use nullptr for the other arrays that are already copied from host to device
  // (rpole and pval do not change between induce() iterations)
  cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr);
  atom->add_extra_data();

  // set the correct cutoff and alpha
  _off2_polar = off2_polar;
  _aewald = aewald;

  // launch the kernel
  umutual2b(_eflag,_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
  // NOTE: move this step to update_fieldp() to delay device-host transfer
  //   after umutual1 and self are done on the GPU
  // *fieldp_ptr=_fieldp.host.begin();
  // _fieldp.update_host(_max_fieldp_size*8,false);
}
// ---------------------------------------------------------------------------
// Prepare for umutual1() after bspline_fill() is done on host
// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed
// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
// host_igrid is allocated with nmax by 4
// - transfer extra data from host to device
// NOTE: can be re-used for fphi_mpole() but with a different bsorder value
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
                                    double ***host_thetai1, double ***host_thetai2,
                                    double ***host_thetai3, int** host_igrid,
                                    const int nzlo_out, const int nzhi_out,
                                    const int nylo_out, const int nyhi_out,
                                    const int nxlo_out, const int nxhi_out) {
  // update bsorder with that of the kspace solver
  _bsorder = bsorder;

  // allocate or resize per-atom arrays
  // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
  // will be consolidated once all terms are ready
  if (_max_thetai_size == 0) {
    // first call: allocate everything with 10% padding over inum_full
    _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
    _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
    _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
    _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE);
  } else {
    // subsequent calls: grow only when the theta arrays are too small
    if ((int)_thetai1.cols()<_max_thetai_size*bsorder) {
      _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
      _thetai1.resize(_max_thetai_size*bsorder);
      _thetai2.resize(_max_thetai_size*bsorder);
      _thetai3.resize(_max_thetai_size*bsorder);
      _igrid.resize(_max_thetai_size*4);
      _fdip_phi1.resize(_max_thetai_size*10);
      _fdip_phi2.resize(_max_thetai_size*10);
      _fdip_sum_phi.resize(_max_thetai_size*20);
    }
  }

#ifdef ASYNC_DEVICE_COPY
  // stage the theta transfers on a separate command queue
  _thetai1.cq(ucl_device->cq(_end_command_queue));
  _thetai2.cq(ucl_device->cq(_end_command_queue));
  _thetai3.cq(ucl_device->cq(_end_command_queue));
#endif

  // pack host data to device: each (atom, bspline-order) pair becomes one
  // numtyp4 holding the 4 per-level B-spline coefficients
  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai1[i][j][0];
      v.y = host_thetai1[i][j][1];
      v.z = host_thetai1[i][j][2];
      v.w = host_thetai1[i][j][3];
      _thetai1[idx] = v;
    }
  _thetai1.update_device(true);

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai2[i][j][0];
      v.y = host_thetai2[i][j][1];
      v.z = host_thetai2[i][j][2];
      v.w = host_thetai2[i][j][3];
      _thetai2[idx] = v;
    }
  _thetai2.update_device(true);

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai3[i][j][0];
      v.y = host_thetai3[i][j][1];
      v.z = host_thetai3[i][j][2];
      v.w = host_thetai3[i][j][3];
      _thetai3[idx] = v;
    }
  _thetai3.update_device(true);

  // igrid uses a stride of 4 per atom but only 3 components are written;
  // the 4th slot is left untouched (presumably padding — confirm with kernel)
  for (int i = 0; i < inum_full; i++) {
    int idx = i*4;
    _igrid[idx+0] = host_igrid[i][0];
    _igrid[idx+1] = host_igrid[i][1];
    _igrid[idx+2] = host_igrid[i][2];
  }
  _igrid.update_device(true);

  // _cgrid_brick holds the grid-based potential
  // record the owned+ghost grid extents for later indexing
  _nzlo_out = nzlo_out;
  _nzhi_out = nzhi_out;
  _nylo_out = nylo_out;
  _nyhi_out = nyhi_out;
  _nxlo_out = nxlo_out;
  _nxhi_out = nxhi_out;
  _ngridz = nzhi_out - nzlo_out + 1;
  _ngridy = nyhi_out - nylo_out + 1;
  _ngridx = nxhi_out - nxlo_out + 1;
  _num_grid_points = _ngridx * _ngridy * _ngridz;

  int numel = _num_grid_points;
  if (_cgrid_brick.cols() == 0) {
    // first allocation with 10% padding
    int nsize=(int)(((double)numel)*1.1);
    _cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
  } else if (numel > (int)_cgrid_brick.cols()) {
    _cgrid_brick.resize(numel);
  }
}
// ---------------------------------------------------------------------------
// fphi_uind = induced potential from grid
// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
// NOTE: host_grid_brick is from ic_kspace post_convolution()
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
                                    void **host_fdip_phi1,
                                    void **host_fdip_phi2,
                                    void **host_fdip_sum_phi)
{
  // pack the complex-valued host grid (last index: 0=real, 1=imag) into the
  // flat device brick, iterating z-slowest to match _num_grid_points order
  int n = 0;
  for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
    for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
      for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
        numtyp2 v;
        v.x = host_grid_brick[iz][iy][ix][0];
        v.y = host_grid_brick[iz][iy][ix][1];
        _cgrid_brick[n] = v;
        n++;
      }
  _cgrid_brick.update_device(_num_grid_points, false);

#ifdef ASYNC_DEVICE_COPY
  // make sure any staged async transfers have completed before launching
  ucl_device->sync();
#endif

  // launch the kernel with its execution configuration (see below)
  fphi_uind();

  // copy data from device to host
  _fdip_phi1.update_host(_max_thetai_size*10, false);
  _fdip_phi2.update_host(_max_thetai_size*10, false);
  _fdip_sum_phi.update_host(_max_thetai_size*20, false);

  // return the pointers to the host-side arrays
  *host_fdip_phi1 = _fdip_phi1.host.begin();
  *host_fdip_phi2 = _fdip_phi2.host.begin();
  *host_fdip_sum_phi = _fdip_sum_phi.host.begin();
}
// ---------------------------------------------------------------------------
// Interpolate the potential from the PME grid
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::fphi_uind() {
int ainum=ans->inum();
if (ainum == 0)
return 0;
// Compute the block size and grid size to keep all cores busy
const int BX=block_size();
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
time_pair.start();
int ngridxy = _ngridx * _ngridy;
k_fphi_uind.set_size(GX,BX);
k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
&_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum,
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// fphi_mpole = multipole potential from grid (limited to polar_kspace for now)
// fphi_mpole extracts the permanent multipole potential from
// the particle mesh Ewald grid
// NOTE: host_grid_brick is from p_kspace post_convolution()
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec)
{
int n = 0;
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
numtyp2 v;
v.x = host_grid_brick[iz][iy][ix];
v.y = (numtyp)0;
_cgrid_brick[n] = v;
n++;
}
_cgrid_brick.update_device(_num_grid_points, false);
_felec = felec;
fphi_mpole();
_fdip_sum_phi.update_host(_max_thetai_size*20, false);
*host_fphi = _fdip_sum_phi.host.begin();
}
// ---------------------------------------------------------------------------
// Interpolate the potential from the PME grid
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::fphi_mpole() {
int ainum=ans->inum();
if (ainum == 0)
return 0;
// Compute the block size and grid size to keep all cores busy
const int BX=block_size();
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
time_pair.start();
int ngridxy = _ngridx * _ngridy;
k_fphi_mpole.set_size(GX,BX);
k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
&_fdip_sum_phi, &_bsorder, &ainum, &_felec,
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
                                     double **host_rpole, double **host_uind,
                                     double **host_uinp, double *host_pval,
                                     const bool eflag_in, const bool vflag_in,
                                     const bool eatom, const bool vatom,
                                     const double aewald, const double felec,
                                     const double off2_polar, void **tep_ptr) {
  // cast necessary data arrays from host to device
  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  // expose the host-side tep buffer to the caller
  *tep_ptr=_tep.host.begin();

  // cutoff and Ewald parameters for the polar real-space kernel
  _off2_polar = off2_polar;
  _felec = felec;
  _aewald = aewald;
  const int red_blocks=polar_real(_eflag,_vflag);

  // only copy answers (forces, energies and virial) back from the device
  // in the last kernel (which is polar_real here)
  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  device->add_ans_object(ans);

  // copy tep from device to host
  _tep.update_host(_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Return the memory bytes allocated on the host and device
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Return the host memory (in bytes) used by this pair style:
// atom + neighbor storage, a few scalars, and the object itself.
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
double BaseAmoebaT::host_memory_usage_atomic() const {
  const double atom_bytes = device->atom.host_memory_usage();
  const double nbor_bytes = nbor->host_memory_usage();
  return atom_bytes + nbor_bytes + 4*sizeof(numtyp) +
         sizeof(BaseAmoeba<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Setup the FFT plan: only placeholder for now
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/)
{
  // Intentionally empty placeholder: arguments are ignored for now.
  // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT)
}
// ---------------------------------------------------------------------------
// Compute FFT on the device: only placeholder for now
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/,
                                const int /*numel*/, const int /*mode*/)
{
  // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT)
  // NOTE: the cuFFT implementation below is currently disabled via "#if 0";
  //   it packs the interleaved real/imag host array into cufftDoubleComplex,
  //   runs an in-place forward Z2Z transform and copies the result back
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
  if (fft_plan_created == false) {
    int m = numel/2;
    cufftPlan1d(&plan, m, CUFFT_Z2Z, 1);
    fft_plan_created = true;
  }

  // n = number of double complex
  int n = numel/2;

  // copy the host array to the device (data)
  UCL_Vector<cufftDoubleComplex,cufftDoubleComplex> data;
  data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE);
  int m = 0;
  double* d_in = (double*)in;
  for (int i = 0; i < n; i++) {
    data[i].x = d_in[m];
    data[i].y = d_in[m+1];
    m += 2;
  }
  data.update_device(false);

  // perform the in-place forward FFT
  cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device,
                                    (cufftDoubleComplex*)&data.device, CUFFT_FORWARD);
  if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result);
  ucl_device->sync();
  data.update_host(false);

  // copy back the data to the host array
  m = 0;
  double* d_out = (double*)out;
  for (int i = 0; i < n; i++) {
    d_out[m] = data[i].x;
    d_out[m+1] = data[i].y;
    m += 2;
  }
  data.clear();
#endif
}
// ---------------------------------------------------------------------------
// Copy the extra data from host to device
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Pack the AMOEBA per-atom host arrays into atom->extra as six consecutive
// numtyp4 sections of nall entries each:
//   section 0: rpole[0..3]
//   section 1: rpole[4], rpole[5], rpole[6], rpole[8]
//   section 2: rpole[9], rpole[12], amtype, amgroup
//   section 3: uind[0..2]  (w unused, set to 0)
//   section 4: uinp[0..2]  (w unused, set to 0)
//   section 5: pval        (y,z,w unused, set to 0)
// A nullptr argument skips that section's copy but still advances the offset,
// so previously transferred data stays in place.
// NOTE(review): rpole components 7, 10 and 11 are never packed — presumably
//   redundant entries of the symmetric quadrupole tensor; confirm with kernels.
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
                                  double** uind, double** uinp, double* pval) {
  // signal that we need to transfer extra data from the host
  atom->extra_data_unavail();

  int _nall=atom->nall();
  numtyp4 *pextra=reinterpret_cast<numtyp4*>(&(atom->extra[0]));

  int n = 0;
  int nstride = 1; //4;
  if (rpole) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][0];
      pextra[idx].y = rpole[i][1];
      pextra[idx].z = rpole[i][2];
      pextra[idx].w = rpole[i][3];
    }

    n += nstride*_nall;
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][4];
      pextra[idx].y = rpole[i][5];
      pextra[idx].z = rpole[i][6];
      pextra[idx].w = rpole[i][8];
    }

    n += nstride*_nall;
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][9];
      pextra[idx].y = rpole[i][12];
      pextra[idx].z = (numtyp)amtype[i];
      pextra[idx].w = (numtyp)amgroup[i];
    }
  } else {
    // rpole unchanged: skip the first two sections...
    n += 2*nstride*_nall;
  }

  // ...and the third (offset advances whether or not rpole was packed)
  n += nstride*_nall;
  if (uind) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = uind[i][0];
      pextra[idx].y = uind[i][1];
      pextra[idx].z = uind[i][2];
      pextra[idx].w = 0;
    }
  }

  n += nstride*_nall;
  if (uinp) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = uinp[i][0];
      pextra[idx].y = uinp[i][1];
      pextra[idx].z = uinp[i][2];
      pextra[idx].w = 0;
    }
  }

  n += nstride*_nall;
  if (pval) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = pval[i];
      pextra[idx].y = 0;
      pextra[idx].z = 0;
      pextra[idx].w = 0;
    }
  }
}
// ---------------------------------------------------------------------------
// Compile (load) the kernel strings and set the kernels
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                  const char *kname_multipole,
                                  const char *kname_udirect2b,
                                  const char *kname_umutual2b,
                                  const char *kname_polar,
                                  const char *kname_fphi_uind,
                                  const char *kname_fphi_mpole,
                                  const char *kname_short_nbor,
                                  const char* kname_special15) {
  if (_compiled)
    return;

  // discard any previous program before (re)building
  if (pair_program) delete pair_program;
  pair_program=new UCL_Program(dev);
  // EVFLAG=1 enables energy/virial accumulation in the kernels
  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
  pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen);

  // bind one kernel object per named entry point
  k_multipole.set_function(*pair_program, kname_multipole);
  k_udirect2b.set_function(*pair_program, kname_udirect2b);
  k_umutual2b.set_function(*pair_program, kname_umutual2b);
  k_polar.set_function(*pair_program, kname_polar);
  k_fphi_uind.set_function(*pair_program, kname_fphi_uind);
  k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole);
  k_short_nbor.set_function(*pair_program, kname_short_nbor);
  k_special15.set_function(*pair_program, kname_special15);
  pos_tex.get_texture(*pair_program, "pos_tex");
  q_tex.get_texture(*pair_program, "q_tex");

  _compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
  // clamp the per-atom thread count to the device's subgroup size so
  // subgroup reductions in the polar kernel stay within one subgroup
  if (dev.has_subgroup_support()) {
    int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size);
    if (_threads_per_atom > mx_subgroup_sz)
      _threads_per_atom = mx_subgroup_sz;
    device->set_simd_size(mx_subgroup_sz);
  }
#endif
}
// ---------------------------------------------------------------------------
// Specify 1-5 neighbors from the current neighbor list
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Launch the kernel that merges the 1-5 special neighbors into the device
// neighbor list.  Returns the number of thread blocks launched.
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::add_onefive_neighbors() {
  // launch configuration: _threads_per_atom threads cooperate per atom
  const int block_x = block_size();
  int grid_x = static_cast<int>(ceil(static_cast<double>(ans->inum())/
                                     (block_x/_threads_per_atom)));

  int nall = atom->nall();
  int natoms = ans->inum();
  int nbor_pitch = nbor->nbor_pitch();

  k_special15.set_size(grid_x,block_x);
  k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(),
                  &atom->dev_tag, &dev_nspecial15, &dev_special15,
                  &natoms, &nall, &nbor_pitch,
                  &_threads_per_atom);

  return grid_x;
}
template class BaseAmoeba<PRECISION,ACC_PRECISION>;
}

325
lib/gpu/lal_base_amoeba.h Normal file
View File

@ -0,0 +1,325 @@
/***************************************************************************
base_amoeba.h
-------------------
Trung Dac Nguyen (Northwestern)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_BASE_AMOEBA_H
#define LAL_BASE_AMOEBA_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
//#define ASYNC_DEVICE_COPY
#if !defined(USE_OPENCL) && !defined(USE_HIP)
// temporary workaround for int2 also defined in cufft
#ifdef int2
#undef int2
#endif
#include "cufft.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseAmoeba {
 public:
  BaseAmoeba();
  virtual ~BaseAmoeba();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
    * \param cell_size cutoff + skin
    * \param gpu_split fraction of particles handled by device
    * \param k_name name for the kernel for force calculation
    *
    * Returns:
    * -  0 if successful
    * - -1 if fix gpu not found
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                  const int maxspecial, const int maxspecial15, const double cell_size,
                  const double gpu_split, FILE *screen, const void *pair_program,
                  const char *kname_multipole, const char *kname_udirect2b,
                  const char *kname_umutual2b, const char *kname_polar,
                  const char *kname_fphi_uind, const char *kname_fphi_mpole,
                  const char *kname_short_nbor, const char* kname_special15);

  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead(const int add_kernels=0);

  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(nall, success)) {
      // rebind textures when the underlying device buffers were reallocated
      pos_tex.bind_float(atom->x,4);
      q_tex.bind_float(atom->q,1);
    }
    ans->resize(inum,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to copied to host
    * \param current maximum number of neighbors
    * \note olist_size=total number of local particles **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }

  /// Check if there is enough storage for neighbors and realloc if not
  /** \param nlocal number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to copied to host
    * \param current maximum number of neighbors
    * \note host_inum is 0 if the host is performing neighboring
    * \note nlocal+host_inum=total number local particles
    * \note olist_size=0 **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear_atomic();

  /// Returns memory usage on device per atom
  int bytes_per_atom_atomic(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage_atomic() const;

  /// Accumulate timers
  inline void acc_timers() {
    if (device->time_device()) {
      nbor->acc_timers(screen);
      time_pair.add_to_total();
      atom->acc_timers();
      ans->acc_timers();
    }
  }

  /// Zero timers
  inline void zero_timers() {
    time_pair.zero();
    atom->zero_timers();
    ans->zero_timers();
  }

  /// Copy neighbor list from host
  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
                    int **firstneigh, bool &success);

  /// Build neighbor list on device
  int build_nbor_list(const int inum, const int host_inum,
                      const int nall, double **host_x, int *host_type,
                      double *sublo, double *subhi, tagint *tag, int **nspecial,
                      tagint **special, int *nspecial15, tagint **special15,
                      bool &success);

  /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed
  virtual int** precompute(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
                           int *host_amgroup, double **host_rpole, double **host_uind,
                           double **host_uinp, double *host_pval, double *sublo, double *subhi,
                           tagint *tag, int **nspecial, tagint **special,
                           int *nspecial15, tagint **special15,
                           const bool eflag, const bool vflag,
                           const bool eatom, const bool vatom, int &host_start,
                           int **&ilist, int **&numj, const double cpu_time, bool &success,
                           double *charge, double *boxlo, double *prd);

  /// Compute multipole real-space with device neighboring
  virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
                                      double **host_x, int *host_type, int *host_amtype,
                                      int *host_amgroup, double **host_rpole, double *host_pval,
                                      double *sublo, double *subhi, tagint *tag,
                                      int **nspecial, tagint **special, int *nspecial15, tagint **special15,
                                      const bool eflag, const bool vflag, const bool eatom, const bool vatom,
                                      int &host_start, int **ilist, int **numj, const double cpu_time,
                                      bool &success, const double aewald, const double felec,
                                      const double off2_mpole, double *charge, double *boxlo,
                                      double *prd, void **tep_ptr);

  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
  virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2_polar, void **fieldp_ptr);

  /// Compute the real space part of the induced field (umutual2b) with device neighboring
  virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2_polar, void **fieldp_ptr);

  /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
  virtual void precompute_kspace(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
                                 const int nzlo_out, const int nzhi_out,
                                 const int nylo_out, const int nyhi_out,
                                 const int nxlo_out, const int nxhi_out);

  /// Interpolate the induced potential from the grid
  virtual void compute_fphi_uind(double ****host_grid_brick,
                                 void **host_fdip_phi1, void **host_fdip_phi2,
                                 void **host_fdip_sum_phi);

  /// Interpolate the multipolar potential from the grid
  virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
                                  const double felec);

  /// Compute polar real-space with device neighboring
  virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag, const bool vflag,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2_polar,
                                  void **tep_ptr);

  /// copy field and fieldp from device to host after umutual2b
  virtual void update_fieldp(void **fieldp_ptr) {
    *fieldp_ptr=_fieldp.host.begin();
    // _fieldp store both arrays, one after another
    _fieldp.update_host(_max_fieldp_size*8,false);
  }

  /// setup a plan for FFT, where size is the number of elements
  void setup_fft(const int size, const int element_type=0);

  /// compute forward/backward FFT on the device
  void compute_fft1d(void* in, void* out, const int numel, const int mode);

  // -------------------------- DEVICE DATA -------------------------

  /// Device Properties and Atom and Neighbor storage
  Device<numtyp,acctyp> *device;

  /// Geryon device
  UCL_Device *ucl_device;

  /// Device Timers
  UCL_Timer time_pair;

  /// Host device load balancer
  Balance<numtyp,acctyp> hd_balancer;

  /// LAMMPS pointer for screen output
  FILE *screen;

  // --------------------------- ATOM DATA --------------------------

  /// Atom Data
  Atom<numtyp,acctyp> *atom;

  UCL_Vector<numtyp,numtyp> polar1, polar2, polar3, polar4, polar5;

  /// cast host arrays into a single array for atom->extra
  void cast_extra_data(int* amtype, int* amgroup, double** rpole,
                       double** uind, double** uinp, double* pval=nullptr);

  /// Per-atom arrays: _tep uses 4 entries/atom, _fieldp 8 entries/atom
  /// (field and fieldp stored back to back)
  UCL_Vector<acctyp,acctyp> _tep, _fieldp;
  int _nmax, _max_tep_size, _max_fieldp_size;

  // B-spline order and per-atom kspace work arrays (see precompute_kspace)
  int _bsorder;
  UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
  UCL_Vector<int,int> _igrid;
  UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
  UCL_Vector<acctyp,acctyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
  int _max_thetai_size;
  // owned+ghost PME grid extents and derived sizes
  int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
  int _ngridx, _ngridy, _ngridz, _num_grid_points;
  int _end_command_queue;

  // ------------------------ FORCE/ENERGY DATA -----------------------

  Answer<numtyp,acctyp> *ans;

  // --------------------------- NBOR DATA ----------------------------

  /// Neighbor data
  Neighbor *nbor;
  /// Device storage for 1-5 special neighbor counts
  UCL_D_Vec<int> dev_nspecial15;
  /// Device storage for special neighbors
  UCL_D_Vec<tagint> dev_special15, dev_special15_t;
  int add_onefive_neighbors();

  /// Compact device neighbor list ((2+max_nbors) entries per atom)
  UCL_D_Vec<int> dev_short_nbor;

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
  UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
  UCL_Kernel k_fphi_uind, k_fphi_mpole;
  UCL_Kernel k_special15, k_short_nbor;
  inline int block_size() { return _block_size; }
  inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {}

  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex;
  UCL_Texture q_tex;

 protected:
  bool _compiled;
  int _block_size, _block_bio_size, _threads_per_atom;
  int _extra_fields;
  double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors;
  double _gpu_overhead, _driver_overhead;
  bool short_nbor_polar_avail;
  UCL_D_Vec<int> *_nbor_data;

  // cached Ewald/cutoff parameters set by the compute_* entry points
  numtyp _aewald,_felec;
  numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar;
  int _eflag, _vflag;

  void compile_kernels(UCL_Device &dev, const void *pair_string,
                       const char *kname_multipole, const char *kname_udirect2b,
                       const char *kname_umutual2b, const char *kname_polar,
                       const char *kname_fphi_uind, const char *kname_fphi_mpole,
                       const char *kname_short_nbor, const char* kname_special15);

  virtual int multipole_real(const int eflag, const int vflag) = 0;
  virtual int udirect2b(const int eflag, const int vflag) = 0;
  virtual int umutual2b(const int eflag, const int vflag) = 0;
  virtual int fphi_uind();
  virtual int fphi_mpole();
  virtual int polar_real(const int eflag, const int vflag) = 0;

#if !defined(USE_OPENCL) && !defined(USE_HIP)
  cufftHandle plan;
#endif
  // set once the (currently disabled) cuFFT path in compute_fft1d creates its plan
  bool fft_plan_created;
};
}
#endif

View File

@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
bool charge = false;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
bool charge = true;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
bool charge = true;
bool rot = true;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
bool charge = false;
bool rot = false;
bool vel = true;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel);
if (success!=0)
return success;
@ -193,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall,
const double cpu_time, bool &success, tagint *tag,
double **host_v, const double dtinvsqrt,
const int seed, const int timestep,
const int nlocal, double *boxlo, double *prd) {
const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
@ -258,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
const double cpu_time, bool &success,
double **host_v, const double dtinvsqrt,
const int seed, const int timestep,
double *boxlo, double *prd) {
double * /*boxlo*/, double * /*prd*/) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;

View File

@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
else
_threads_per_atom=device->threads_per_three();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
bool charge = false;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -44,19 +44,15 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
int CHARMMLongT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double ** /*host_offset*/, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *_screen,
double host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,charmm_long,"k_charmm_long");

View File

@ -52,7 +52,7 @@ DeviceT::~Device() {
}
template <class numtyp, class acctyp>
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu,
const int first_gpu_id, const int gpu_mode,
const double p_split, const int t_per_atom,
const double user_cell_size, char *ocl_args,
@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
}
_ocl_compile_string="-cl-mad-enable ";
#ifdef CL_VERSION_2_0
_ocl_compile_string+="-cl-std=CL2.0 ";
#endif
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
std::string(OCL_PRECISION_COMPILE);
@ -438,7 +441,7 @@ template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int nall, const int maxspecial,
const bool vel) {
const bool vel, const int extra_fields) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision())
@ -467,7 +470,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields))
return -3;
_data_in_estimate++;
@ -477,6 +480,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_data_in_estimate++;
if (vel)
_data_in_estimate++;
if (extra_fields>0)
_data_in_estimate++;
} else {
if (!atom.charge() && charge)
_data_in_estimate++;
@ -484,7 +490,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_data_in_estimate++;
if (!atom.velocity() && vel)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
if (atom.using_extra() && extra_fields>0)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields))
return -3;
}
@ -520,7 +528,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
template <class numtyp, class acctyp>
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
const int host_nlocal, const int nall,
const int host_nlocal, const int /*nall*/,
const int maxspecial, const int gpu_host,
const int max_nbors, const double cutoff,
const bool pre_cut, const int threads_per_atom,

View File

@ -61,6 +61,7 @@ class Device {
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum number of special bonded atoms per atom
* \param vel True if velocities need to be stored
* \param extra_fields Nonzero if extra fields need to be stored
*
* Returns:
* - 0 if successful
@ -70,7 +71,7 @@ class Device {
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
const int nlocal, const int nall, const int maxspecial,
const bool vel=false);
const bool vel=false, const int extra_fields=0);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for

View File

@ -28,10 +28,10 @@ static DPD<PRECISION,ACC_PRECISION> DPDTMF;
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0,
double **host_gamma, double **host_sigma, double **host_cut,
double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
double **host_gamma, double **host_sigma, double **host_cut,
double *special_lj, const int inum,
const int nall, const int /*max_nbors*/, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
DPDTMF.clear();
gpu_mode=DPDTMF.device->gpu_mode();
double gpu_split=DPDTMF.device->particle_split();

View File

@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
const bool /*eatom*/, const bool /*vatom*/,
int &host_start, const double cpu_time,
bool &success, void **fp_ptr) {
this->acc_timers();
@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag_in,
const bool vflag_in, const bool eatom,
const bool vatom, int &host_start, int **ilist, int **jnum,
const bool vflag_in, const bool /*eatom*/,
const bool /*vatom*/, int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success, int &inum,
void **fp_ptr) {
this->acc_timers();

641
lib/gpu/lal_hippo.cpp Normal file
View File

@ -0,0 +1,641 @@
/***************************************************************************
hippo.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the hippo pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#if defined(USE_OPENCL)
#include "hippo_cl.h"
#elif defined(USE_CUDART)
const char *hippo=0;
#else
#include "hippo_cubin.h"
#endif
#include "lal_hippo.h"
#include <cassert>
namespace LAMMPS_AL {
#define HippoT Hippo<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
// Construct an uninitialized HIPPO accelerator.  No device resources are
// touched here; everything is allocated later in init().
template <class numtyp, class acctyp>
HippoT::Hippo() : BaseAmoeba<numtyp,acctyp>(), _allocated(false) {}
// Destructor: release the coefficient tables via clear(), then drop the two
// kernels that are specific to HIPPO (the base class does not own them).
template <class numtyp, class acctyp>
HippoT::~Hippo() {
  clear();
  k_repulsion.clear();
  k_dispersion.clear();
}
// Per-atom device memory estimate; HIPPO adds nothing beyond the atomic
// base-class storage, so simply forward the request.
template <class numtyp, class acctyp>
int HippoT::bytes_per_atom(const int max_nbors) const {
  const int base_bytes = this->bytes_per_atom_atomic(max_nbors);
  return base_bytes;
}
// Initialize the HIPPO accelerator: set up the device and the kernels shared
// with AMOEBA through init_atomic(), load the two HIPPO-specific kernels,
// and upload all per-amtype/per-amclass coefficient tables and the
// special-neighbor scale factors to read-only device memory.
// Returns 0 on success, otherwise the error code from init_atomic().
//
// NOTE(review): the declaration in lal_hippo.h names these three pointers in
// the order special_mpole, special_repel, special_disp, while this definition
// uses special_repel, special_disp, special_mpole.  All three are
// const double *, so the signatures still match, but confirm that callers
// pass the arrays in the order consumed here.
template <class numtyp, class acctyp>
int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
                 const double *host_pdamp, const double *host_thole,
                 const double *host_dirdamp, const int *host_amtype2class,
                 const double *host_special_repel, const double *host_special_disp,
                 const double *host_special_mpole,
                 const double *host_special_polar_wscale,
                 const double *host_special_polar_piscale,
                 const double *host_special_polar_pscale,
                 const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
                 const double *host_csix, const double *host_adisp,
                 const double *host_pcore, const double *host_palpha,
                 const int nlocal, const int nall, const int max_nbors,
                 const int maxspecial, const int maxspecial15,
                 const double cell_size, const double gpu_split, FILE *_screen,
                 const double polar_dscale, const double polar_uscale) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
                            cell_size,gpu_split,_screen,hippo,
                            "k_hippo_multipole", "k_hippo_udirect2b",
                            "k_hippo_umutual2b", "k_hippo_polar",
                            "k_hippo_fphi_uind", "k_hippo_fphi_mpole",
                            "k_hippo_short_nbor", "k_hippo_special15");
  if (success!=0)
    return success;

  // kernels specific to HIPPO (not part of the shared AMOEBA set)
  k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion");
  k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion");

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
  shared_types=false;
  int max_shared_types=this->device->max_shared_types();
  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
    lj_types=max_shared_types;
    shared_types=true;
  }
  _lj_types=lj_types;

  // Allocate a host write buffer for data initialization
  UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);

  // per-amtype damping parameters; .w carries the amtype -> amclass map
  for (int i = 0; i < max_amtype; i++) {
    host_write[i].x = host_pdamp[i];
    host_write[i].y = host_thole[i];
    host_write[i].z = host_dirdamp[i];
    host_write[i].w = host_amtype2class[i];
  }
  coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
  ucl_copy(coeff_amtype,host_write,false);

  // per-amtype Pauli repulsion parameters (host_write buffer is reused)
  for (int i = 0; i < max_amtype; i++) {
    host_write[i].x = host_sizpr[i];
    host_write[i].y = host_dmppr[i];
    host_write[i].z = host_elepr[i];
    host_write[i].w = (numtyp)0;
  }
  coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
  ucl_copy(coeff_rep,host_write,false);

  // per-amclass dispersion / charge-penetration parameters
  UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
  for (int i = 0; i < max_amclass; i++) {
    host_write2[i].x = host_csix[i];
    host_write2[i].y = host_adisp[i];
    host_write2[i].z = host_pcore[i];
    host_write2[i].w = host_palpha[i];
  }
  coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
  ucl_copy(coeff_amclass,host_write2,false);

  // special-neighbor scale factors for the polar/multipole kernels
  UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
  sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<5; i++) {
    dview[i].x=host_special_polar_wscale[i];
    dview[i].y=host_special_polar_piscale[i];
    dview[i].z=host_special_polar_pscale[i];
    dview[i].w=host_special_mpole[i];
  }
  ucl_copy(sp_polar,dview,5,false);

  // scale factors consumed by the repulsion and dispersion kernels
  sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<5; i++) {
    dview[i].x=host_special_repel[i];
    dview[i].y=host_special_disp[i];
    dview[i].z=(numtyp)0;
    dview[i].w=(numtyp)0;
  }
  ucl_copy(sp_nonpolar,dview,5,false);

  _polar_dscale = polar_dscale;
  _polar_uscale = polar_uscale;

  _allocated=true;
  // record the device memory footprint reported by the GPU package
  this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes()
    + coeff_amclass.row_bytes() + sp_polar.row_bytes()
    + sp_nonpolar.row_bytes() + this->_tep.row_bytes()
    + this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
    + this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
    + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
  return 0;
}
// Free the device-side coefficient tables and base-class storage.
// Idempotent: does nothing unless init() completed successfully.
template <class numtyp, class acctyp>
void HippoT::clear() {
  if (_allocated) {
    _allocated=false;
    coeff_amtype.clear();
    coeff_rep.clear();
    coeff_amclass.clear();
    sp_polar.clear();
    sp_nonpolar.clear();
    this->clear_atomic();
  }
}
// Host memory consumed by this style: the base-class accounting plus the
// size of this object itself.
template <class numtyp, class acctyp>
double HippoT::host_memory_usage() const {
  double bytes = sizeof(Hippo<numtyp,acctyp>);
  bytes += this->host_memory_usage_atomic();
  return bytes;
}
// ---------------------------------------------------------------------------
// Compute the repulsion term, returning tep
// ---------------------------------------------------------------------------
// Compute the Pauli repulsion term on the device and expose the resulting
// torque (tep) array to the host through tep_ptr.  Most parameters are
// unused here (commented out) because neighboring and data upload happen in
// an earlier precompute step; this entry point consumes only the
// energy/virial flags, the repulsion cutoff, and the taper coefficients.
template <class numtyp, class acctyp>
void HippoT::compute_repulsion(const int /*ago*/, const int inum_full,
                               const int /*nall*/, double ** /*host_x*/,
                               int * /*host_type*/, int * /*host_amtype*/,
                               int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/,
                               int ** /*nspecial*/, tagint ** /*special*/,
                               int * /*nspecial15*/, tagint ** /*special15*/,
                               const bool eflag_in, const bool vflag_in,
                               const bool eatom, const bool vatom,
                               int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                               const double /*cpu_time*/, bool & /*success*/,
                               const double /*aewald*/, const double off2_repulse,
                               double * /*host_q*/, double * /*boxlo*/, double * /*prd*/,
                               double cut2, double c0, double c1, double c2,
                               double c3, double c4, double c5, void **tep_ptr) {
  this->acc_timers();
  // translate energy/virial requests into kernel flag values:
  // 2 = per-atom accumulation, 1 = global accumulation, 0 = off
  int eflag, vflag;
  if (eatom) eflag=2;
  else if (eflag_in) eflag=1;
  else eflag=0;
  if (vatom) vflag=2;
  else if (vflag_in) vflag=1;
  else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
  if (eflag) eflag=2;
  if (vflag) vflag=2;
#endif

  // NOTE(review): the local eflag/vflag computed above are passed only to
  // set_kernel(); the launch below uses the member flags this->_eflag and
  // this->_vflag (set elsewhere, presumably in precompute) -- confirm that
  // this divergence is intentional.
  this->set_kernel(eflag,vflag);

  // ------------------- Resize _tep array ------------------------
  // grow with a 10% cushion to limit the number of reallocations

  if (inum_full>this->_max_tep_size) {
    this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    this->_tep.resize(this->_max_tep_size*4);
  }
  *tep_ptr=this->_tep.host.begin();

  // stash repulsion cutoff and taper coefficients for the kernel launch
  this->_off2_repulse = off2_repulse;
  _cut2 = cut2;
  _c0 = c0;
  _c1 = c1;
  _c2 = c2;
  _c3 = c3;
  _c4 = c4;
  _c5 = c5;
  repulsion(this->_eflag,this->_vflag);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Launch the repulsion kernel
// ---------------------------------------------------------------------------
// Launch the repulsion kernel.  First builds the short neighbor list for
// the repulsion cutoff _off2_repulse (repulsion is the first kernel of a
// HIPPO time step at this point), then runs k_repulsion.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::repulsion(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff _off2_repulse;
  // at this point repulsion is the first kernel in a time step for HIPPO
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &this->_off2_repulse, &ainum,
                         &nbor_pitch, &this->_threads_per_atom);

  k_repulsion.set_size(GX,BX);
  k_repulsion.run(&this->atom->x, &this->atom->extra,
                  &coeff_rep, &sp_nonpolar,
                  &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                  &this->dev_short_nbor,
                  &this->ans->force, &this->ans->engv, &this->_tep,
                  &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                  &this->_threads_per_atom, &this->_aewald,
                  &this->_off2_repulse, &_cut2,
                  &_c0, &_c1, &_c2, &_c3, &_c4, &_c5);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Compute dispersion real-space
// ---------------------------------------------------------------------------
// Real-space dispersion: upload the per-atom multipole data the kernel
// needs, record the Ewald parameter and cutoff, and launch the kernel.
// Forces/energies/virial are intentionally left on the device; they are
// copied back by the last kernel of the time step (polar_real), not here.
template <class numtyp, class acctyp>
void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                     double **host_rpole, const double aewald,
                                     const double off2_disp) {
  // only amtype/amgroup/rpole are required; the remaining extra-data
  // slots (uind/uinp/pval) are unused by this kernel
  this->cast_extra_data(host_amtype, host_amgroup, host_rpole,
                        nullptr, nullptr, nullptr);
  this->atom->add_extra_data();

  this->_aewald = aewald;
  this->_off2_disp = off2_disp;

  dispersion_real(this->_eflag,this->_vflag);
}
// ---------------------------------------------------------------------------
// Launch the dispersion real-space kernel
// ---------------------------------------------------------------------------
// Launch the real-space dispersion kernel.  Builds the short neighbor list
// for the dispersion cutoff _off2_disp first, then runs k_dispersion.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::dispersion_real(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff off2_disp,
  // at this point dispersion is the first kernel in a time step
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &this->_off2_disp, &ainum,
                         &nbor_pitch, &this->_threads_per_atom);

  k_dispersion.set_size(GX,BX);
  k_dispersion.run(&this->atom->x, &this->atom->extra,
                   &coeff_amtype, &coeff_amclass, &sp_nonpolar,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                   &this->dev_short_nbor,
                   &this->ans->force, &this->ans->engv,
                   &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                   &this->_threads_per_atom, &this->_aewald,
                   &this->_off2_disp);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Compute the multipole real-space term, returning tep
// ---------------------------------------------------------------------------
// Compute the real-space multipole term and expose the torque (tep) array
// to the host through tep_ptr.  Only host_pval is uploaded here; the other
// commented-out parameters were already transferred by an earlier step.
template <class numtyp, class acctyp>
void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full,
                                    const int /*nall*/, double ** /*host_x*/,
                                    int * /*host_type*/, int * /*host_amtype*/,
                                    int * /*host_amgroup*/, double ** /*host_rpole*/,
                                    double* host_pval, double * /*sublo*/,
                                    double * /*subhi*/, tagint * /*tag*/,
                                    int ** /*nspecial*/, tagint ** /*special*/,
                                    int * /*nspecial15*/, tagint ** /*special15*/,
                                    const bool /*eflag_in*/, const bool /*vflag_in*/,
                                    const bool /*eatom*/, const bool /*vatom*/,
                                    int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                                    const double /*cpu_time*/, bool & /*success*/,
                                    const double aewald, const double felec,
                                    const double off2_mpole, double * /*host_q*/,
                                    double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {
  // upload pval only; the other extra-data slots are already on the device
  this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval);
  this->atom->add_extra_data();

  // ------------------- Resize _tep array ------------------------
  // grow with a 10% cushion to limit the number of reallocations

  if (inum_full>this->_max_tep_size) {
    this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    this->_tep.resize(this->_max_tep_size*4);
  }
  *tep_ptr=this->_tep.host.begin();

  // parameters consumed by the multipole kernel launch
  this->_off2_mpole = off2_mpole;
  this->_felec = felec;
  this->_aewald = aewald;
  multipole_real(this->_eflag,this->_vflag);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Launch the multipole real-space kernel
// ---------------------------------------------------------------------------
// Launch the real-space multipole kernel.  Builds the short neighbor list
// for the multipole cutoff _off2_mpole first, then runs k_multipole.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::multipole_real(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff off2_mpole
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &this->_off2_mpole, &ainum,
                         &nbor_pitch, &this->_threads_per_atom);

  this->k_multipole.set_size(GX,BX);
  this->k_multipole.run(&this->atom->x, &this->atom->extra,
                        &coeff_amtype, &coeff_amclass, &sp_polar,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor,
                        &this->ans->force, &this->ans->engv, &this->_tep,
                        &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                        &this->_threads_per_atom, &this->_aewald, &this->_felec,
                        &this->_off2_mpole, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Compute the direct real space part of the permanent field
// returning field and fieldp
// ---------------------------------------------------------------------------
// Real-space direct contribution to the permanent field.  Uploads the
// induced dipole arrays (uind/uinp) and pval, launches udirect2b, and
// downloads the combined field/fieldp result to the host buffer exposed
// through fieldp_ptr.
template <class numtyp, class acctyp>
void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double **host_uind, double **host_uinp, double* host_pval,
                               const double aewald, const double off2_polar,
                               void** fieldp_ptr) {
  // push the per-atom extra data required by this kernel to the device
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval);
  this->atom->add_extra_data();

  this->_aewald = aewald;
  this->_off2_polar = off2_polar;
  *fieldp_ptr=this->_fieldp.host.begin();

  udirect2b(this->_eflag,this->_vflag);

  // _fieldp stores field followed by fieldp, hence 8 values per atom
  this->_fieldp.update_host(this->_max_fieldp_size*8,false);
}
// ---------------------------------------------------------------------------
// Launch the real-space permanent field kernel
// ---------------------------------------------------------------------------
// Launch the real-space permanent-field kernel (udirect2b).  Builds the
// short neighbor list for the polar cutoff _off2_polar on first use in a
// time step; the list is shared by the subsequent polar-cutoff kernels.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list for the cutoff _off2_polar, if not done yet;
  // this is the first kernel in a time step where _off2_polar is used
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(),
                           &this->dev_short_nbor, &this->_off2_polar, &ainum,
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_udirect2b.set_size(GX,BX);
  this->k_udirect2b.run(&this->atom->x, &this->atom->extra,
                        &coeff_amtype, &coeff_amclass, &sp_polar,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor,
                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
                        &this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
                        &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Compute the direct real space term of the induced field
// returning field and fieldp
// ---------------------------------------------------------------------------
// Real-space direct contribution to the induced field.  Uploads the current
// induced dipoles (uind/uinp) and launches umutual2b.  The field/fieldp
// result is deliberately NOT copied back here; the device-to-host transfer
// is deferred (see update_fieldp()) to overlap it with other work.
template <class numtyp, class acctyp>
void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double **host_uind, double **host_uinp, double * /*host_pval*/,
                               const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) {
  // push the updated induced dipoles to the device
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
  this->atom->add_extra_data();

  this->_aewald = aewald;
  this->_off2_polar = off2_polar;

  umutual2b(this->_eflag,this->_vflag);
}
// ---------------------------------------------------------------------------
// Launch the real-space induced field kernel
// ---------------------------------------------------------------------------
// Launch the real-space induced-field kernel (umutual2b).  Reuses the
// polar-cutoff short neighbor list when available, building it otherwise.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

  this->time_pair.start();

  // Build the short neighbor list if not done yet
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(), &this->dev_short_nbor,
                           &this->_off2_polar, &ainum, &nbor_pitch,
                           &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_umutual2b.set_size(GX,BX);
  this->k_umutual2b.run(&this->atom->x, &this->atom->extra,
                        &coeff_amtype, &coeff_amclass, &sp_polar,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
                        &nbor_pitch, &this->_threads_per_atom, &this->_aewald,
                        &this->_off2_polar, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// ---------------------------------------------------------------------------
// Compute the real-space polarization term.  This is the LAST kernel of a
// HIPPO time step, so it is also the point where the accumulated answers
// (forces, energies, virial) are copied back from the device and the answer
// object is queued for the host-side reduction.  The torque (tep) array is
// exposed to the host through tep_ptr.
template <class numtyp, class acctyp>
void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                                double **host_uind, double **host_uinp, double * /*host_pval*/,
                                const bool eflag_in, const bool vflag_in,
                                const bool eatom, const bool vatom,
                                const double aewald, const double felec,
                                const double off2_polar, void **tep_ptr) {
  // push the converged induced dipoles to the device
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
  this->atom->add_extra_data();

  *tep_ptr=this->_tep.host.begin();

  this->_off2_polar = off2_polar;
  this->_felec = felec;
  this->_aewald = aewald;
  const int red_blocks=polar_real(this->_eflag,this->_vflag);

  // only copy answers (forces, energies and virial) back from the device
  // in the last kernel in a timestep (which is polar_real here)
  this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  this->device->add_ans_object(this->ans);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Launch the polar real-space kernel
// ---------------------------------------------------------------------------
// Launch the real-space polarization kernel.  Reuses the polar-cutoff short
// neighbor list when available, and resets the availability flag at the end
// because polar_real is currently the last kernel of a time step.
// Returns the number of thread blocks launched, or 0 if no local atoms.
template <class numtyp, class acctyp>
int HippoT::polar_real(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
  /*
  const int cus = this->device->gpu->cus();
  while (GX < cus && GX > 1) {
    BX /= 2;
    GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
  }
  */
  this->time_pair.start();

  // Build the short neighbor list if not done yet
  if (!this->short_nbor_polar_avail) {
    this->k_short_nbor.set_size(GX,BX);
    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                           &this->_nbor_data->begin(),
                           &this->dev_short_nbor, &this->_off2_polar, &ainum,
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }

  this->k_polar.set_size(GX,BX);
  this->k_polar.run(&this->atom->x, &this->atom->extra,
                    &coeff_amtype, &coeff_amclass, &sp_polar,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                    &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv, &this->_tep,
                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                    &this->_threads_per_atom, &this->_aewald, &this->_felec,
                    &this->_off2_polar, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  // Signal that short nbor list is not avail for the next time step;
  // do it here because polar_real() is the last kernel in a time step at this point
  this->short_nbor_polar_avail = false;

  return GX;
}
template class Hippo<PRECISION,ACC_PRECISION>;
}

2519
lib/gpu/lal_hippo.cu Normal file

File diff suppressed because it is too large Load Diff

166
lib/gpu/lal_hippo.h Normal file
View File

@ -0,0 +1,166 @@
/***************************************************************************
hippo.h
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the hippo pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_HIPPO_H
#define LAL_HIPPO_H
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Hippo : public BaseAmoeba<numtyp, acctyp> {
 public:
  Hippo();
  ~Hippo();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
   * \param cell_size cutoff + skin
   * \param gpu_split fraction of particles handled by device
   *
   * Returns:
   * - 0 if successful
   * - -1 if fix gpu not found
   * - -3 if there is an out of memory error
   * - -4 if the GPU library was not compiled for GPU
   * - -5 Double precision is not supported on card
   *
   * NOTE(review): the special_* parameters are declared here in the order
   * (mpole, repel, disp), while the wrapper in lal_hippo_ext.cpp passes
   * (repel, disp, mpole); all are const double* so this compiles either
   * way -- verify against the Hippo::init definition in lal_hippo.cpp. **/
  int init(const int ntypes, const int max_amtype, const int max_amclass,
           const double *host_pdamp, const double *host_thole,
           const double *host_dirdamp, const int *host_amtype2class,
           const double *host_special_mpole,
           const double *host_special_repel,
           const double *host_special_disp,
           const double *host_special_polar_wscale,
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
           const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
           const double *host_csix, const double *host_adisp,
           const double *host_pcore, const double *host_palpha,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const int maxspecial15, const double cell_size,
           const double gpu_split, FILE *_screen,
           const double polar_dscale, const double polar_uscale);

  /// Compute repulsion with device neighboring
  virtual void compute_repulsion(const int ago, const int inum_full,
                                 const int nall, double **host_x,
                                 int *host_type, int *host_amtype,
                                 int *host_amgroup, double **host_rpole,
                                 double *sublo, double *subhi, tagint *tag,
                                 int **nspecial, tagint **special,
                                 int *nspecial15, tagint **special15,
                                 const bool eflag_in, const bool vflag_in,
                                 const bool eatom, const bool vatom,
                                 int &host_start, int **ilist, int **jnum,
                                 const double cpu_time, bool &success,
                                 const double aewald, const double off2_repulse,
                                 double *host_q, double *boxlo, double *prd,
                                 double cut2, double c0, double c1, double c2,
                                 double c3, double c4, double c5,void** tep_ptr);

  /// Compute dispersion real-space with device neighboring
  virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                       double **host_rpole, const double aewald,
                                       const double off2_disp);

  /// Compute multipole real-space with device neighboring
  virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
                                      double **host_x, int *host_type, int *host_amtype,
                                      int *host_amgroup, double **host_rpole, double *host_pval,
                                      double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special,
                                      int *nspecial15, tagint **special15,
                                      const bool eflag, const bool vflag,
                                      const bool eatom, const bool vatom, int &host_start,
                                      int **ilist, int **numj, const double cpu_time, bool &success,
                                      const double aewald, const double felec, const double off2_mpole, double *charge,
                                      double *boxlo, double *prd, void **tep_ptr);

  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
  virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double* host_pval,
                                 const double aewald, const double off2_polar, void** fieldp_ptr);

  /// Compute the real space part of the induced field (umutual2b) with device neighboring
  virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2_polar,
                                 void** fieldp_ptr);

  /// Compute polar real-space with device neighboring
  virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag_in, const bool vflag_in,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2_polar,
                                  void **tep_ptr);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
  /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
  UCL_D_Vec<numtyp4> coeff_amtype;
  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
  UCL_D_Vec<numtyp4> coeff_amclass;
  /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z;
  UCL_D_Vec<numtyp4> coeff_rep;
  /// Special polar values [0-4]:
  /// sp_polar.x = special_polar_wscale
  /// sp_polar.y = special_polar_pscale
  /// sp_polar.z = special_polar_piscale
  /// sp_polar.w = special_mpole
  UCL_D_Vec<numtyp4> sp_polar;
  /// Special nonpolar values [0-4]:
  /// sp_nonpolar.x = special_hal
  /// sp_nonpolar.y = special_repel
  /// sp_nonpolar.z = special_disp
  UCL_D_Vec<numtyp4> sp_nonpolar;
  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;
  /// Number of atom types
  int _lj_types;
  numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5;
  numtyp _polar_dscale, _polar_uscale;
  numtyp _qqrd2e;
  UCL_Kernel k_repulsion, k_dispersion;

 protected:
  bool _allocated;
  // Per-term real-space kernel launchers; each returns the grid size (GX)
  // used for the launch (see definitions in lal_hippo.cpp).
  int repulsion(const int eflag, const int vflag);
  int dispersion_real(const int eflag, const int vflag);
  int multipole_real(const int eflag, const int vflag);
  int udirect2b(const int eflag, const int vflag);
  int umutual2b(const int eflag, const int vflag);
  int polar_real(const int eflag, const int vflag);
};
}
#endif

231
lib/gpu/lal_hippo_ext.cpp Normal file
View File

@ -0,0 +1,231 @@
/***************************************************************************
hippo_ext.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Functions for LAMMPS access to hippo acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include <iostream>
#include <cassert>
#include <cmath>
#include "lal_hippo.h"
using namespace std;
using namespace LAMMPS_AL;
static Hippo<PRECISION,ACC_PRECISION> HIPPOMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
// Initialize the file-static HIPPOMF accelerator instance for a new run:
// clear old state, initialize on MPI world rank 0 first (so the kernel
// compilation message is printed once), then on all remaining ranks one
// GPU-sharing process at a time.  Returns the init() status code
// (0 on success; negative on failure -- see Hippo::init in lal_hippo.h).
//
// NOTE(review): this wrapper declares and forwards the special_* arrays in
// the order (repel, disp, mpole), while lal_hippo.h declares Hippo::init
// with (mpole, repel, disp).  All are const double*, so no compile error
// results either way -- confirm against the Hippo::init definition.
int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                   const double *host_pdamp, const double *host_thole,
                   const double *host_dirdamp, const int *host_amtype2class,
                   const double *host_special_repel,
                   const double *host_special_disp,
                   const double *host_special_mpole,
                   const double *host_special_polar_wscale,
                   const double *host_special_polar_piscale,
                   const double *host_special_polar_pscale,
                   const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
                   const double *host_csix, const double *host_adisp,
                   const double *host_pcore, const double *host_palpha,
                   const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const int maxspecial15,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   const double polar_dscale, const double polar_uscale) {
  HIPPOMF.clear();
  gpu_mode=HIPPOMF.device->gpu_mode();
  double gpu_split=HIPPOMF.device->particle_split();
  int first_gpu=HIPPOMF.device->first_device();
  int last_gpu=HIPPOMF.device->last_device();
  int world_me=HIPPOMF.device->world_me();
  int gpu_rank=HIPPOMF.device->gpu_rank();
  int procs_per_gpu=HIPPOMF.device->procs_per_gpu();

  HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu);

  // only replica rank 0 with a valid screen handle prints progress messages
  bool message=false;
  if (HIPPOMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // world rank 0 initializes (and compiles kernels) first ...
  int init_ok=0;
  if (world_me==0)
    init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
                         host_pdamp, host_thole, host_dirdamp,
                         host_amtype2class, host_special_repel, host_special_disp,
                         host_special_mpole, host_special_polar_wscale,
                         host_special_polar_piscale, host_special_polar_pscale,
                         host_sizpr, host_dmppr, host_elepr,
                         host_csix, host_adisp, host_pcore, host_palpha,
                         nlocal, nall, max_nbors,
                         maxspecial, maxspecial15, cell_size, gpu_split,
                         screen, polar_dscale, polar_uscale);

  HIPPOMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // ... then the remaining ranks, serialized over processes sharing a GPU
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
                           host_pdamp, host_thole, host_dirdamp,
                           host_amtype2class, host_special_repel, host_special_disp,
                           host_special_mpole, host_special_polar_wscale,
                           host_special_polar_piscale, host_special_polar_pscale,
                           host_sizpr, host_dmppr, host_elepr,
                           host_csix, host_adisp, host_pcore, host_palpha,
                           nlocal, nall, max_nbors,
                           maxspecial, maxspecial15, cell_size, gpu_split,
                           screen, polar_dscale, polar_uscale);

    HIPPOMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    HIPPOMF.estimate_gpu_overhead();
  return init_ok;
}
// Free all host and device data held by the static HIPPOMF instance.
void hippo_gpu_clear() {
  HIPPOMF.clear();
}
// Forward to Hippo::precompute: cast positions/types to the device and
// (re)build the neighbor list when needed.  The induced-dipole arrays
// (host_uind/host_uinp/host_pval) are intentionally unused here and nullptr
// is forwarded in their place.  Returns the neighbor-list handle from
// precompute().
int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
                           int *host_amgroup, double **host_rpole,
                           double ** /*host_uind*/, double ** /*host_uinp*/, double * /*host_pval*/,
                           double *sublo, double *subhi, tagint *tag,
                           int **nspecial, tagint **special,
                           int *nspecial15, tagint **special15,
                           const bool eflag_in, const bool vflag_in,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, double *host_q, double *boxlo, double *prd) {
  return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type,
                            host_amtype, host_amgroup, host_rpole,
                            nullptr, nullptr, nullptr, sublo, subhi, tag,
                            nspecial, special, nspecial15, special15,
                            eflag_in, vflag_in, eatom, vatom,
                            host_start, ilist, jnum, cpu_time,
                            success, host_q, boxlo, prd);
}
// Forward to Hippo::compute_repulsion (Pauli repulsion term); tep_ptr
// receives the device pointer for the torque-per-atom output.
void hippo_gpu_compute_repulsion(const int ago, const int inum_full,
                                 const int nall, double **host_x, int *host_type,
                                 int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double *sublo, double *subhi, tagint *tag, int **nspecial,
                                 tagint **special, int *nspecial15, tagint** special15,
                                 const bool eflag, const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
                                 int **ilist, int **jnum, const double cpu_time,
                                 bool &success, const double aewald, const double off2,
                                 double *host_q, double *boxlo, double *prd,
                                 double cut2, double c0, double c1, double c2,
                                 double c3, double c4, double c5, void **tep_ptr) {
  HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type,
                            host_amtype, host_amgroup, host_rpole, sublo, subhi,
                            tag, nspecial, special, nspecial15, special15,
                            eflag, vflag, eatom, vatom, host_start, ilist, jnum,
                            cpu_time, success, aewald, off2, host_q, boxlo, prd,
                            cut2, c0, c1, c2, c3, c4, c5, tep_ptr);
}
// Forward to Hippo::compute_dispersion_real (real-space dispersion term).
void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                       double **host_rpole, const double aewald,
                                       const double off2) {
  HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole,
                                  aewald, off2);
}
// Forward to Hippo::compute_multipole_real (real-space multipole term);
// tep_ptr receives the device pointer for the torque-per-atom output.
void hippo_gpu_compute_multipole_real(const int ago, const int inum_full,
              const int nall, double **host_x, int *host_type,
              int *host_amtype, int *host_amgroup, double **host_rpole,
              double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial,
              tagint **special, int *nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom,
              const bool vatom, int &host_start,
              int **ilist, int **jnum, const double cpu_time,
              bool &success, const double aewald, const double felec, const double off2,
              double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
              host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi,
              tag, nspecial, special, nspecial15, special15,
              eflag, vflag, eatom, vatom, host_start, ilist, jnum,
              cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
}
// Forward to Hippo::compute_udirect2b: real-space part of the permanent
// (direct) field; fieldp_ptr receives the device field/fieldp pointer.
void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2, void **fieldp_ptr) {
  HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole,
                            host_uind, host_uinp, host_pval,
                            aewald, off2, fieldp_ptr);
}
// Forward to Hippo::compute_umutual2b: real-space part of the induced
// (mutual) field; fieldp_ptr receives the device field/fieldp pointer.
void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2, void **fieldp_ptr) {
  HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
                            aewald, off2, fieldp_ptr);
}
// Hand the caller the current device field/fieldp buffer via fieldp_ptr.
void hippo_gpu_update_fieldp(void **fieldp_ptr) {
  HIPPOMF.update_fieldp(fieldp_ptr);
}
// Forward to Hippo::compute_polar_real (real-space polarization term);
// tep_ptr receives the device pointer for the torque-per-atom output.
void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag_in, const bool vflag_in,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2,
                                  void **tep_ptr) {
  HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
                             eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
// Forward to Hippo::precompute_kspace: upload the B-spline coefficient
// tables (thetai1..3), the per-atom grid indices, and the owned-grid
// bounds used by the k-space (PME-style) part of the computation.
void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
                                 const int nzlo_out, const int nzhi_out,
                                 const int nylo_out, const int nyhi_out,
                                 const int nxlo_out, const int nxhi_out) {
  HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2,
                            host_thetai3, igrid, nzlo_out, nzhi_out,
                            nylo_out, nyhi_out, nxlo_out, nxhi_out);
}
// Forward to Hippo::compute_fphi_uind: interpolate induced-dipole
// potentials from the grid; output pointers are returned via the
// host_fdip_* arguments.
void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
                         void **host_fdip_phi2, void **host_fdip_sum_phi) {
  HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi);
}
// Report total host memory (bytes) used by the HIPPO accelerator instance.
double hippo_gpu_bytes() {
  return HIPPOMF.host_memory_usage();
}

431
lib/gpu/lal_hippo_extra.h Normal file
View File

@ -0,0 +1,431 @@
/// **************************************************************************
// hippo_extra.h
// -------------------
// Trung Dac Nguyen
//
// Device code for hippo math routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifndef LAL_HIPPO_EXTRA_H
#define LAL_HIPPO_EXTRA_H
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif
#define MY_PI2 (numtyp)1.57079632679489661923
#define MY_PI4 (numtyp)0.78539816339744830962
/* ----------------------------------------------------------------------
damprep generates coefficients for the Pauli repulsion
damping function for powers of the interatomic distance
literature reference:
J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An
Anisotropic, Atomic Multipole Model", Journal of Chemical Physics,
150, 084104 (2019)
------------------------------------------------------------------------- */
// Generate Pauli-repulsion damping coefficients dmpik[0,2,4,6,8(,10)] for
// powers of the interatomic distance r.  rr1..rr11 are precomputed inverse
// odd powers of r; rorder selects whether the 11th-order term is needed.
// dmpi/dmpk are the per-atom damping exponents (see Rackers & Ponder,
// J. Chem. Phys. 150, 084104 (2019)).
ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
                        const numtyp rr3, const numtyp rr5, const numtyp rr7,
                        const numtyp rr9, const numtyp rr11, const int rorder,
                        const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11])
{
  numtyp r3,r4;
  numtyp r5,r6,r7,r8;
  numtyp s,ds,d2s;
  numtyp d3s,d4s,d5s;
  numtyp dmpi2,dmpk2;
  numtyp dmpi22,dmpi23;
  numtyp dmpi24,dmpi25;
  numtyp dmpi26,dmpi27;
  numtyp dmpk22,dmpk23;
  numtyp dmpk24,dmpk25;
  numtyp dmpk26;
  numtyp eps,diff;
  numtyp expi,expk;
  numtyp dampi,dampk;
  numtyp pre,term,tmp;

  // d5s is only assigned when rorder >= 11, but it is unconditionally
  // multiplied by rr11 below; initialize it so that multiply never reads
  // an indeterminate value (the product is unused when rorder < 11).
  d5s = (numtyp)0.0;

  // compute tolerance value for damping exponents
  eps = (numtyp)0.001;
  diff = dmpi-dmpk; // fabs(dmpi-dmpk)
  if (diff < (numtyp)0) diff = -diff;

  // treat the case where alpha damping exponents are equal
  if (diff < eps) {
    r3 = r2 * r;
    r4 = r3 * r;
    r5 = r4 * r;
    r6 = r5 * r;
    r7 = r6 * r;
    dmpi2 = (numtyp)0.5 * dmpi;
    dampi = dmpi2 * r;
    expi = ucl_exp(-dampi);
    dmpi22 = dmpi2 * dmpi2;
    dmpi23 = dmpi22 * dmpi2;
    dmpi24 = dmpi23 * dmpi2;
    dmpi25 = dmpi24 * dmpi2;
    dmpi26 = dmpi25 * dmpi2;
    pre = (numtyp)128.0;
    s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi;
    ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0;
    d2s = dmpi24 * expi * r5 / (numtyp)9.0;
    d3s = dmpi25 * expi * r6 / (numtyp)45.0;
    d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0;
    if (rorder >= 11) {
      r8 = r7 * r;
      dmpi27 = dmpi2 * dmpi26;
      d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0;
    }

  // treat the case where alpha damping exponents are unequal
  } else {
    r3 = r2 * r;
    r4 = r3 * r;
    r5 = r4 * r;
    dmpi2 = (numtyp)0.5 * dmpi;
    dmpk2 = (numtyp)0.5 * dmpk;
    dampi = dmpi2 * r;
    dampk = dmpk2 * r;
    expi = ucl_exp(-dampi);
    expk = ucl_exp(-dampk);
    dmpi22 = dmpi2 * dmpi2;
    dmpi23 = dmpi22 * dmpi2;
    dmpi24 = dmpi23 * dmpi2;
    dmpi25 = dmpi24 * dmpi2;
    dmpk22 = dmpk2 * dmpk2;
    dmpk23 = dmpk22 * dmpk2;
    dmpk24 = dmpk23 * dmpk2;
    dmpk25 = dmpk24 * dmpk2;
    term = dmpi22 - dmpk22;
    pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0);
    tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term;
    s = (dampi-tmp)*expk + (dampk+tmp)*expi;
    ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term -
          (numtyp)4.0*dmpi2*dmpk2/term) * expk +
      (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
    // NOTE: literals below are cast to numtyp so single-precision builds
    // do not silently promote to double arithmetic
    d2s = (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 -
           ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
           (numtyp)4.0*dmpi2*dmpk2/term) * expk +
      (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 +
       ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
       (numtyp)4.0*dmpi2*dmpk2/term) * expi;
    d3s = (dmpi2*dmpk23*r4/(numtyp)15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 -
           ((numtyp)4.0/(numtyp)15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term -
           (numtyp)4.0*dmpi2*dmpk22*r/term - (numtyp)4.0/term*dmpi2*dmpk2) * expk +
      (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 +
       ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term +
       (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi;
    d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 +
           dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 -
           ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/(numtyp)21.0)*dmpi2*dmpk24*r3/term -
           ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
           (numtyp)4.0*dmpi2*dmpk2/term) * expk +
      (dmpi24*dmpk2*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 +
       dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 +
       ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term +
       ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
       (numtyp)4.0*dmpi2*dmpk2/term) * expi;
    if (rorder >= 11) {
      r6 = r5 * r;
      dmpi26 = dmpi25 * dmpi2;
      dmpk26 = dmpk25 * dmpk2;
      d5s = (dmpi2*dmpk25*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi2*dmpk24*r5 +
             dmpi2*dmpk23*r4/(numtyp)21.0 + dmpi2*dmpk22*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 -
             ((numtyp)4.0/(numtyp)945.0)*dmpi2*dmpk26*r5/term -
             ((numtyp)4.0/(numtyp)63.0)*dmpi2*dmpk25*r4/term - ((numtyp)4.0/(numtyp)9.0)*dmpi2*dmpk24*r3/term -
             ((numtyp)16.0/(numtyp)9.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
             (numtyp)4.0*dmpi2*dmpk2/term) * expk +
        (dmpi25*dmpk2*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi24*dmpk2*r5 +
         dmpi23*dmpk2*r4/(numtyp)21.0 + dmpi22*dmpk2*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 +
         ((numtyp)4.0/(numtyp)945.0)*dmpi26*dmpk2*r5/term + ((numtyp)4.0/(numtyp)63.0)*dmpi25*dmpk2*r4/term +
         ((numtyp)4.0/(numtyp)9.0)*dmpi24*dmpk2*r3/term + ((numtyp)16.0/(numtyp)9.0)*dmpi23*dmpk2*r2/term +
         (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
    }
  }

  // convert partial derivatives into full derivatives
  s = s * rr1;
  ds = ds * rr3;
  d2s = d2s * rr5;
  d3s = d3s * rr7;
  d4s = d4s * rr9;
  d5s = d5s * rr11;
  dmpik[0] = (numtyp)0.5 * pre * s * s;
  dmpik[2] = pre * s * ds;
  dmpik[4] = pre * (s*d2s + ds*ds);
  dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s);
  dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s);
  if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s);
}
/* ----------------------------------------------------------------------
damppole generates coefficients for the charge penetration
damping function for powers of the interatomic distance
literature references:
L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the
Effective Fragment Potential Method: Theory and Application to
the Benzene Dimer", Journal of Computational Chemistry, 28,
276-291 (2007) [Gordon f1 and f2 models]
J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and
J. W. Ponder, "An Optimized Charge Penetration Model for Use with
the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19,
276-291 (2017)
------------------------------------------------------------------------- */
// Generate charge-penetration damping coefficients for powers of the
// interatomic distance r: dmpi[]/dmpk[] for core-valence terms and
// dmpik[] for valence-valence terms (Gordon f1 model).  alphai/alphak
// are the per-atom damping exponents; rorder >= 11 also fills dmpik[10].
// Fix vs. original: several floating-point literals were missing the
// (numtyp) cast, forcing double-precision arithmetic in single-precision
// builds; they are now cast consistently with the rest of the file.
ucl_inline void damppole(const numtyp r, const int rorder,
                         const numtyp alphai, const numtyp alphak,
                         numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11])
{
  numtyp termi,termk;
  numtyp termi2,termk2;
  numtyp alphai2,alphak2;
  numtyp eps,diff;
  numtyp expi,expk;
  numtyp dampi,dampk;
  numtyp dampi2,dampi3;
  numtyp dampi4,dampi5;
  numtyp dampi6,dampi7;
  numtyp dampi8;
  numtyp dampk2,dampk3;
  numtyp dampk4,dampk5;
  numtyp dampk6;

  // compute tolerance and exponential damping factors
  eps = (numtyp)0.001;
  diff = alphai-alphak;
  if (diff < (numtyp)0) diff = -diff;
  dampi = alphai * r;
  dampk = alphak * r;
  expi = ucl_exp(-dampi);
  expk = ucl_exp(-dampk);

  // core-valence charge penetration damping for Gordon f1
  dampi2 = dampi * dampi;
  dampi3 = dampi * dampi2;
  dampi4 = dampi2 * dampi2;
  dampi5 = dampi2 * dampi3;
  dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi;
  dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
  dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
  dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
  dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                           (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi;
  if (diff < eps) {
    // equal exponents: k-coefficients are identical to the i-coefficients
    dmpk[0] = dmpi[0];
    dmpk[2] = dmpi[2];
    dmpk[4] = dmpi[4];
    dmpk[6] = dmpi[6];
    dmpk[8] = dmpi[8];
  } else {
    dampk2 = dampk * dampk;
    dampk3 = dampk * dampk2;
    dampk4 = dampk2 * dampk2;
    dampk5 = dampk2 * dampk3;
    dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk;
    dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
    dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
    dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk;
    dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
                             (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk;
  }

  // valence-valence charge penetration damping for Gordon f1
  if (diff < eps) {
    dampi6 = dampi3 * dampi3;
    dampi7 = dampi3 * dampi4;
    dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 +
                              dampi3/(numtyp)48.0)*expi;
    dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
                              (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi;
    dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                              dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
    dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                              dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi;
    dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                              dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
                              dampi7/(numtyp)5040.0)*expi;
    if (rorder >= 11) {
      dampi8 = dampi4 * dampi4;
      dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                                 dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
                                 dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi;
    }

  } else {
    alphai2 = alphai * alphai;
    alphak2 = alphak * alphak;
    termi = alphak2 / (alphak2-alphai2);
    termk = alphai2 / (alphai2-alphak2);
    termi2 = termi * termi;
    termk2 = termk * termk;
    dmpik[0] = (numtyp)1.0 - termi2*((numtyp)1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi -
      termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk;
    dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
    dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk;
    dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
                                     dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi -
      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk;
    dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                                     (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi -
      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
              (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 +
                                (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 +
                                (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk;
    if (rorder >= 11) {
      dampi6 = dampi3 * dampi3;
      dampk6 = dampk3 * dampk3;
      dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                                        (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 +
                                        dampi6/(numtyp)1890.0)*expi -
        termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 +
                (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk -
        (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 +
                                  dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi -
        (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 +
                                  dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk;
    }
  }
}
/* ----------------------------------------------------------------------
dampdir = direct field damping coefficients
dampdir generates coefficients for the direct field damping
function for powers of the interatomic distance
------------------------------------------------------------------------- */
// Generate direct-field damping coefficients dmpi[2,4,6] and dmpk[2,4,6]
// for powers of the interatomic distance r (Gordon f1 model, HIPPO).
// When the exponents are (nearly) equal, the k-coefficients are copied
// from the i-coefficients.
// Fix vs. original: dampk4/30.0 was missing the (numtyp) cast, forcing a
// double-precision divide in single-precision builds.
ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk)
{
  numtyp eps,diff;
  numtyp expi,expk;
  numtyp dampi,dampk;
  numtyp dampi2,dampk2;
  numtyp dampi3,dampk3;
  numtyp dampi4,dampk4;

  // compute tolerance and exponential damping factors
  eps = (numtyp)0.001;
  diff = alphai-alphak; // fabs(alphai-alphak);
  if (diff < (numtyp)0) diff = -diff;
  dampi = alphai * r;
  dampk = alphak * r;
  expi = ucl_exp(-dampi);
  expk = ucl_exp(-dampk);

  // core-valence charge penetration damping for Gordon f1 (HIPPO)
  dampi2 = dampi * dampi;
  dampi3 = dampi * dampi2;
  dampi4 = dampi2 * dampi2;
  dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
  dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
  dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
  if (diff < eps) {
    dmpk[2] = dmpi[2];
    dmpk[4] = dmpi[4];
    dmpk[6] = dmpi[6];
  } else {
    dampk2 = dampk * dampk;
    dampk3 = dampk * dampk2;
    dampk4 = dampk2 * dampk2;
    dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
    dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
    dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk;
  }
}
/* ----------------------------------------------------------------------
dampmut = mutual field damping coefficients
dampmut generates coefficients for the mutual field damping
function for powers of the interatomic distance
------------------------------------------------------------------------- */
// Generate mutual-field damping coefficients dmpik[2] and dmpik[4] for
// powers of the interatomic distance r (Gordon f1 model, HIPPO).
// Fix vs. original: 7.0 and dampi4/48.0 in the equal-exponent branch were
// missing the (numtyp) cast (double promotion in single-precision builds);
// (numtyp)6.00 normalized to (numtyp)6.0 for consistency.
ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5])
{
  numtyp termi,termk;
  numtyp termi2,termk2;
  numtyp alphai2,alphak2;
  numtyp eps,diff;
  numtyp expi,expk;
  numtyp dampi,dampk;
  numtyp dampi2,dampi3;
  numtyp dampi4,dampi5;
  numtyp dampk2,dampk3;

  // compute tolerance and exponential damping factors
  eps = (numtyp)0.001;
  diff = alphai-alphak; // fabs(alphai-alphak);
  if (diff < (numtyp)0) diff = -diff;
  dampi = alphai * r;
  dampk = alphak * r;
  expi = ucl_exp(-dampi);
  expk = ucl_exp(-dampk);

  // valence-valence charge penetration damping for Gordon f1 (HIPPO)
  dampi2 = dampi * dampi;
  dampi3 = dampi * dampi2;
  if (diff < eps) {
    dampi4 = dampi2 * dampi2;
    dampi5 = dampi2 * dampi3;
    dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
                              (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi;
    dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                              dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
  } else {
    dampk2 = dampk * dampk;
    dampk3 = dampk * dampk2;
    alphai2 = alphai * alphai;
    alphak2 = alphak * alphak;
    termi = alphak2 / (alphak2-alphai2);
    termk = alphai2 / (alphai2-alphak2);
    termi2 = termi * termi;
    termk2 = termk * termk;
    dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
    dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk -
      (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi -
      (numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk;
  }
}
#endif

View File

@ -576,6 +576,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_nbor.stop();
if (_time_device)
time_nbor.add_to_total();
// on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial)
// on the device, transpose the matrix (1-d array) for coalesced reads
// dev_special[i][j] = the special i neighbor of atom j
time_transpose.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
@ -679,6 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
if (_cutoff < _cell_size) vadjust*=1.46;
mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
if (mn<33) mn+=3;
resize_max_neighbors<numtyp,acctyp>(mn,success);
set_nbor_block_size(mn/2);
if (!success)
@ -831,6 +837,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_nbor.stop();
}
// Launch the k_transpose kernel to transpose a columns_in x rows_in
// device matrix of tags from 'in' into 'out'.  The 2-D launch grid is
// sized to cover the matrix with _block_cell_2d x _block_cell_2d blocks
// (grid dims rounded up so partial tiles are included).
void Neighbor::transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
                         const int columns_in, const int rows_in)
{
  const int b2x=_block_cell_2d;
  const int b2y=_block_cell_2d;
  const int g2x=static_cast<int>(ceil(static_cast<double>(columns_in)/b2x));
  const int g2y=static_cast<int>(ceil(static_cast<double>(rows_in)/b2y));

  _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
  _shared->k_transpose.run(&out, &in, &columns_in, &rows_in);
}
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
(double **x, const int inum, const int host_inum, const int nall,
Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,

View File

@ -33,7 +33,7 @@
#endif
#endif
#if defined(USE_HIP)
#if defined(USE_HIP) || defined(__APPLE__)
#define LAL_USE_OLD_NEIGHBOR
#endif
@ -259,6 +259,10 @@ class Neighbor {
return o.str();
}
/// Helper function
void transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
const int columns_in, const int rows_in);
private:
NeighborShared *_shared;
UCL_Device *dev;
@ -289,15 +293,17 @@ class Neighbor {
#endif
int _simd_size;
#ifdef LAL_USE_OLD_NEIGHBOR
inline void set_nbor_block_size(const int mn) {
#ifdef LAL_USE_OLD_NEIGHBOR
int desired=mn/(2*_simd_size);
desired*=_simd_size;
if (desired<_simd_size) desired=_simd_size;
else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build;
_block_nbor_build=desired;
#endif
}
#else
inline void set_nbor_block_size(const int) {}
#endif
};
}

View File

@ -48,6 +48,19 @@ _texture_2d( pos_tex,int4);
#define LAL_USE_OLD_NEIGHBOR
#endif
/*
compute the id of the cell where the atoms belong to
x: atom coordinates
cell_id: cell ids
particle_id:
boxlo[0-2]: the lower left corner of the local box
ncell[xyz]: the number of cells in xyz dims
i_cell_size is the inverse cell size
inum = the number of the local atoms that are ported to the device
nall = the number of the local+ghost atoms that are ported to the device
cells_in_cutoff = the number of cells that are within the cutoff
*/
__kernel void calc_cell_id(const numtyp4 *restrict x_,
unsigned *restrict cell_id,
int *restrict particle_id,
@ -90,6 +103,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
}
}
// compute the number of atoms in each cell
__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
int *restrict cell_counts,
int nall, int ncell) {

View File

@ -273,19 +273,19 @@ __kernel void interp(const __global numtyp4 *restrict x_,
int my=mz+fast_mul(ny,npts_x);
for (int m=0; m<order; m++) {
grdtyp y0=z0*rho1d_1[m][tid];
for (int l=0; l<order; l++) {
grdtyp x0=y0*rho1d_0[l][tid];
grdtyp4 el=brick[my+l];
ek.x-=x0*el.x;
ek.y-=x0*el.y;
ek.z-=x0*el.z;
}
for (int l=0; l<order; l++) {
grdtyp x0=y0*rho1d_0[l][tid];
grdtyp4 el=brick[my+l];
ek.x-=x0*el.x;
ek.y-=x0*el.y;
ek.z-=x0*el.z;
}
my+=npts_x;
}
mz+=npts_yx;
}
}
}
ans[ii]=ek;
}
}
}

View File

@ -182,12 +182,15 @@
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_erfc erfc
#else
#define ucl_exp expf
#define ucl_powr powf
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
@ -195,8 +198,7 @@
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#define ucl_exp expf
#define ucl_powr powf
#define ucl_erfc erfcf
#endif

View File

@ -166,6 +166,7 @@
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_erfc erfc
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
@ -330,6 +331,10 @@
#define NEIGHMASK 0x3FFFFFFF
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
#define SBBITS15 29
#define NEIGHMASK15 0x1FFFFFFF
ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; };
// default to 32-bit smallint and other ints, 64-bit bigint:
// same as defined in src/lmptype.h
#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \

View File

@ -150,7 +150,7 @@ double SWT::host_memory_usage() const {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int SWT::loop(const int eflag, const int vflag, const int evatom,
bool &success) {
bool & /*success*/) {
const int nbor_pitch=this->nbor->nbor_pitch();
// build the short neighbor list

View File

@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4);
} \
}
// (SHUFFLE_AVAIL == 1)
#else
#define local_allocate_acc_zeta()
@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4);
} \
}
// EVFLAG == 0
#else
#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \
@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4);
ans[ii]=old; \
}
#endif
#endif
#endif // EVFLAG
#endif // SHUFFLE_AVAIL
#ifdef LAL_SIMD_IP_SYNC
#define t_per_atom t_per_atom_in

View File

@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
const double* costheta, const double* bigb,
const double* big2b, const double* bigc)
{
int success;
int success=0;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
"k_vashishta_three_end","k_vashishta_short_nbor");
@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int VashishtaT::loop(const int eflag, const int vflag, const int evatom,
bool &success) {
bool & /*success*/) {
const int nbor_pitch=this->nbor->nbor_pitch();
// build the short neighbor list

View File

@ -22,6 +22,7 @@
#include "memory.h"
#include "neighbor.h"
#include "remap_wrap.h"
#include "timer.h"
#include <cmath>
#include <cstring>
@ -326,15 +327,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
cfft[n++] = ZEROF;
}
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// perform forward FFT
fft1->compute(cfft,cfft,FFT3d::FORWARD);
time1 = platform::walltime();
if (SCALE) {
double scale = 1.0/nfft_global;
FFT_SCALAR scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
}
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"PRE Convo / POST FFT");
debug_file(CFFT1,"pre.convo.post.fft");
@ -382,15 +391,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
debug_scalar(FFT,"PRE Convo / POST Remap");
debug_file(FFT,"pre.convo.post.remap");
#endif
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// perform forward FFT
fft1->compute(cfft,cfft,FFT3d::FORWARD);
time1 = platform::walltime();
if (SCALE) {
double scale = 1.0/nfft_global;
FFT_SCALAR scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
}
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"PRE Convo / POST FFT");
debug_file(CFFT1,"pre.convo.post.fft");
@ -423,7 +441,16 @@ void *AmoebaConvolution::post_convolution_3d()
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = platform::walltime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");
@ -465,8 +492,18 @@ void *AmoebaConvolution::post_convolution_4d()
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = platform::walltime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");
debug_file(CFFT2,"post.convo.post.fft");

View File

@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers {
int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out;
int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft;
bigint nfft_global; // nx * ny * nz
double *grid_brick_start; // lower left corner of (c)grid_brick data
FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data
AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int);
~AmoebaConvolution();
@ -47,35 +47,37 @@ class AmoebaConvolution : protected Pointers {
FFT_SCALAR *pre_convolution();
void *post_convolution();
private:
int which; // caller name for convolution being performed
int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick
int nbrick_owned; // owned grid points in brick decomp
int nbrick_ghosts; // owned + ghost brick grid points
int ngrid_either; // max of nbrick_owned or nfft_owned
double time_fft;
protected:
int which; // caller name for convolution being performed
int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick
int nbrick_owned; // owned grid points in brick decomp
int nbrick_ghosts; // owned + ghost brick grid points
int ngrid_either; // max of nbrick_owned or nfft_owned
class Pair *amoeba;
class FFT3d *fft1, *fft2;
class Grid3d *gc;
class Remap *remap;
double ***grid_brick; // 3d real brick grid with ghosts
double ****cgrid_brick; // 4d complex brick grid with ghosts
FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts
FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts
FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector
FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector
double *gc_buf1, *gc_buf2; // buffers for GridComm
double *remap_buf; // buffer for Remap
FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm
FFT_SCALAR *remap_buf; // buffer for Remap
void allocate_grid();
void deallocate_grid();
void *zero_3d();
void *zero_4d();
FFT_SCALAR *pre_convolution_3d();
FFT_SCALAR *pre_convolution_4d();
virtual FFT_SCALAR *pre_convolution_4d();
void *post_convolution_3d();
void *post_convolution_4d();
virtual void *post_convolution_4d();
void procs2grid2d(int, int, int, int &, int &);
// DEBUG

View File

@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
double ***gridpre = (double ***) d_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero();
// map atoms to grid
@ -294,7 +294,7 @@ void PairAmoeba::dispersion_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = d_kspace->pre_convolution();
FFT_SCALAR *gridfft = d_kspace->pre_convolution();
// ---------------------
// convolution operation

View File

@ -24,6 +24,7 @@
#include "math_special.h"
#include "my_page.h"
#include "neigh_list.h"
#include "timer.h"
#include <cmath>
@ -381,8 +382,6 @@ void PairAmoeba::induce()
}
}
// if (comm->me == 0) printf("CG iteration count = %d\n",iter);
// terminate the calculation if dipoles failed to converge
// NOTE: could make this an error
@ -546,13 +545,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp)
}
}
// get the reciprocal space part of the mutual field
if (polar_kspace_flag) umutual1(field,fieldp);
double time0, time1, time2;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// get the real space portion of the mutual field
if (polar_rspace_flag) umutual2b(field,fieldp);
time1 = platform::walltime();
// get the reciprocal space part of the mutual field
if (polar_kspace_flag) umutual1(field,fieldp);
time2 = platform::walltime();
// add the self-energy portion of the mutual field
@ -563,6 +568,11 @@ void PairAmoeba::ufield0c(double **field, double **fieldp)
fieldp[i][j] += term*uinp[i][j];
}
}
// accumulate timing information
time_mutual_rspace += time1 - time0;
time_mutual_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@ -785,7 +795,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
// get the reciprocal space part of the permanent field
double time0, time1, time2;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
if (polar_kspace_flag) udirect1(field);
time1 = platform::walltime();
for (i = 0; i < nlocal; i++) {
for (j = 0; j < 3; j++) {
@ -796,6 +811,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
// get the real space portion of the permanent field
if (polar_rspace_flag) udirect2b(field,fieldp);
time2 = platform::walltime();
// get the self-energy portion of the permanent field
@ -806,6 +822,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
fieldp[i][j] += term*rpole[i][j+1];
}
}
// accumulate timing information
time_direct_kspace += time1 - time0;
time_direct_rspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@ -842,18 +863,26 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
}
}
double time0, time1;
// gridpre = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpre = (double ****) ic_kspace->zero();
FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero();
// map 2 values to grid
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
grid_uind(fuind,fuinp,gridpre);
time1 = platform::walltime();
time_grid_uind += (time1 - time0);
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = ic_kspace->pre_convolution();
FFT_SCALAR *gridfft = ic_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -883,12 +912,18 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
// post-convolution operations including backward FFT
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpost = (double ****) ic_kspace->post_convolution();
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution();
// get potential
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
time1 = platform::walltime();
time_fphi_uind += (time1 - time0);
// store fractional reciprocal potentials for OPT method
if (poltyp == OPT) {
@ -1055,7 +1090,7 @@ void PairAmoeba::udirect1(double **field)
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by setup()
double ***gridpre = (double ***) i_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero();
// map multipole moments to grid
@ -1064,7 +1099,7 @@ void PairAmoeba::udirect1(double **field)
// pre-convolution operations including forward FFT
// gridfft = my 1d portion of complex 3d grid in FFT decomp
double *gridfft = i_kspace->pre_convolution();
FFT_SCALAR *gridfft = i_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -1109,7 +1144,7 @@ void PairAmoeba::udirect1(double **field)
// post-convolution operations including backward FFT
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpost = (double ***) i_kspace->post_convolution();
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution();
// get potential

View File

@ -68,25 +68,23 @@ void PairAmoeba::moduli()
int maxfft = MAX(nfft1,nfft2);
maxfft = MAX(maxfft,nfft3);
double *array = new double[bsorder];
double *bsarray = new double[maxfft];
if (maxfft > _nfft_max) {
memory->destroy(_moduli_bsarray);
_nfft_max = maxfft;
memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray");
}
// compute and load the moduli values
double x = 0.0;
bspline(x,bsorder,array);
bspline(x,bsorder,_moduli_array);
for (i = 0; i < maxfft; i++) bsarray[i] = 0.0;
for (i = 0; i < bsorder; i++) bsarray[i+1] = array[i];
for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0;
for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i];
dftmod(bsmod1,bsarray,nfft1,bsorder);
dftmod(bsmod2,bsarray,nfft2,bsorder);
dftmod(bsmod3,bsarray,nfft3,bsorder);
// perform deallocation of local arrays
delete[] array;
delete[] bsarray;
dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder);
dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder);
dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder);
}
/* ----------------------------------------------------------------------
@ -525,7 +523,7 @@ void PairAmoeba::frac_to_cart()
grid_mpole maps fractional atomic multipoles to PME grid
------------------------------------------------------------------------- */
void PairAmoeba::grid_mpole(double **fmp, double ***grid)
void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid)
{
int i,j,k,m,ib,jb,kb;
double v0,u0,t0;
@ -598,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid)
the particle mesh Ewald grid
------------------------------------------------------------------------- */
void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi)
{
int i,j,k,m,ib,jb,kb;
double v0,v1,v2,v3;
@ -742,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
grid_uind maps fractional induced dipoles to the PME grid
------------------------------------------------------------------------- */
void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid)
{
int i,j,k,m,ib,jb,kb;
double v0,u0,t0;
@ -793,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
------------------------------------------------------------------------- */
void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1,
double **fdip_phi2, double **fdip_sum_phi)
{
int i,j,k,m,ib,jb,kb;
@ -1042,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
grid_disp maps dispersion coefficients to PME grid
------------------------------------------------------------------------- */
void PairAmoeba::grid_disp(double ***grid)
void PairAmoeba::grid_disp(FFT_SCALAR ***grid)
{
int i,j,k,m,ib,jb,kb,itype,iclass;
double v0,u0,t0;

View File

@ -21,6 +21,7 @@
#include "math_const.h"
#include "math_special.h"
#include "neigh_list.h"
#include "timer.h"
#include <cmath>
@ -55,6 +56,8 @@ void PairAmoeba::multipole()
double qixx,qixy,qixz,qiyy,qiyz,qizz;
double cii,dii,qii;
double time0,time1,time2;
// set cutoffs, taper coeffs, and PME params
if (use_ewald) choose(MPOLE_LONG);
@ -78,13 +81,18 @@ void PairAmoeba::multipole()
felec = electric / am_dielectric;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// compute the real space part of the Ewald summation
if (mpole_rspace_flag) multipole_real();
time1 = platform::walltime();
// compute the reciprocal space part of the Ewald summation
if (mpole_kspace_flag) multipole_kspace();
time2 = platform::walltime();
// compute the Ewald self-energy term over all the atoms
@ -109,6 +117,11 @@ void PairAmoeba::multipole()
e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0));
empole += e;
}
// accumulate timing information
time_mpole_rspace += time1 - time0;
time_mpole_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@ -361,6 +374,9 @@ void PairAmoeba::multipole_real()
bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2;
}
for (k = 0; k < 6; k++) bn[k] *= felec;
//if (i == 0 && j < 10) {
// printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]);
//}
// find damped multipole intermediates and energy value
@ -404,6 +420,8 @@ void PairAmoeba::multipole_real()
term2i*rr3i + term2k*rr3k + term2ik*rr3ik +
term3i*rr5i + term3k*rr5k + term3ik*rr5ik;
// find damped multipole intermediates for force and torque
de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik +
@ -444,6 +462,7 @@ void PairAmoeba::multipole_real()
term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9);
term6 = 4.0 * rr7;
}
empole += e;
@ -482,6 +501,7 @@ void PairAmoeba::multipole_real()
tq[i][2] += ttmi[2];
// increment force-based gradient and torque on second site
// commenting out j parts for DEBUGGING
f[j][0] += frcx;
f[j][1] += frcy;
@ -638,7 +658,7 @@ void PairAmoeba::multipole_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpre = (double ***) m_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero();
// map atoms to grid
@ -647,7 +667,7 @@ void PairAmoeba::multipole_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft = m_kspace->pre_convolution();
FFT_SCALAR *gridfft = m_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -718,7 +738,7 @@ void PairAmoeba::multipole_kspace()
// post-convolution operations including backward FFT
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpost = (double ***) m_kspace->post_convolution();
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution();
// get potential

View File

@ -21,6 +21,7 @@
#include "math_const.h"
#include "math_special.h"
#include "neigh_list.h"
#include "timer.h"
#include <cmath>
#include <cstring>
@ -55,6 +56,8 @@ void PairAmoeba::polar()
double fix[3],fiy[3],fiz[3];
double tep[3];
double time0,time1,time2;
// set cutoffs, taper coeffs, and PME params
if (use_ewald) choose(POLAR_LONG);
@ -76,11 +79,16 @@ void PairAmoeba::polar()
// compute the real space part of the dipole interactions
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
if (polar_rspace_flag) polar_real();
time1 = platform::walltime();
// compute the reciprocal space part of dipole interactions
if (polar_kspace_flag) polar_kspace();
time2 = platform::walltime();
// compute the Ewald self-energy torque and virial terms
@ -133,6 +141,11 @@ void PairAmoeba::polar()
virpolar[4] -= vxz;
virpolar[5] -= vyz;
}
// accumulate timing information
time_polar_rspace += time1 - time0;
time_polar_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@ -382,7 +395,7 @@ void PairAmoeba::polar_real()
factor_uscale = 1.0;
}
}
//if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale);
r = sqrt(r2);
ck = rpole[j][0];
dkx = rpole[j][1];
@ -597,7 +610,6 @@ void PairAmoeba::polar_real()
dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir;
dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir;
dufld[i][5] += zr*tiz5 + zr*zr*tuir;
dufld[j][0] -= xr*tkx5 + xr*xr*tukr;
dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr;
dufld[j][2] -= yr*tky5 + yr*yr*tukr;
@ -855,6 +867,7 @@ void PairAmoeba::polar_real()
frcx = -2.0 * depx;
frcy = -2.0 * depy;
frcz = -2.0 * depz;
}
// get the dtau/dr terms used for mutual polarization force
@ -1327,7 +1340,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpre = (double ***) p_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1336,7 +1349,7 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft = p_kspace->pre_convolution();
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -1386,7 +1399,7 @@ void PairAmoeba::polar_kspace()
// post-convolution operations including backward FFT
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpost = (double ***) p_kspace->post_convolution();
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution();
// get potential
@ -1419,7 +1432,7 @@ void PairAmoeba::polar_kspace()
// gridpre2 = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpre2 = (double ****) pc_kspace->zero();
FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero();
// map 2 values to grid
@ -1428,7 +1441,7 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = pc_kspace->pre_convolution();
FFT_SCALAR *gridfft = pc_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -1451,7 +1464,7 @@ void PairAmoeba::polar_kspace()
// post-convolution operations including backward FFT
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpost = (double ****) pc_kspace->post_convolution();
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution();
// get potential
@ -1857,7 +1870,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
double ***gridpre = (double ***) p_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1887,7 +1900,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
gridpre = (double ***) p_kspace->zero();
gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1896,7 +1909,7 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors
double *gridfft2 = p_kspace->pre_convolution();
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
// ---------------------
// convolution operation
@ -1953,7 +1966,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
double ***gridpre = (double ***) p_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1962,12 +1975,12 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft = p_kspace->pre_convolution();
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
// gridfft1 = copy of first FFT
int nfft_owned = p_kspace->nfft_owned;
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double));
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR));
// assign ??? to the PME grid
@ -1982,7 +1995,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
gridpre = (double ***) p_kspace->zero();
gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@ -1991,7 +2004,7 @@ void PairAmoeba::polar_kspace()
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft2 = p_kspace->pre_convolution();
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
// ---------------------
// convolution operation

View File

@ -194,8 +194,8 @@ void FixAmoebaBiTorsion::init()
// error check that PairAmoeba or PairHiippo exist
pair = nullptr;
pair = force->pair_match("amoeba",1,0);
if (!pair) pair = force->pair_match("hippo",1,0);
pair = force->pair_match("^amoeba",0,0);
if (!pair) pair = force->pair_match("^hippo",0,0);
if (!pair)
error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo");

View File

@ -285,8 +285,9 @@ void ImproperAmoeba::init_style()
// check if PairAmoeba disabled improper terms
Pair *pair = nullptr;
pair = force->pair_match("amoeba",1,0);
if (!pair) pair = force->pair_match("hippo",1,0);
pair = force->pair_match("^amoeba",0,0);
if (!pair) pair = force->pair_match("^hippo",0,0);
if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo");
int tmp;

View File

@ -29,6 +29,7 @@
#include "my_page.h"
#include "neigh_list.h"
#include "neighbor.h"
#include "timer.h"
#include "update.h"
#include <cmath>
@ -47,6 +48,7 @@ enum{MUTUAL,OPT,TCG,DIRECT};
enum{GEAR,ASPC,LSQR};
#define DELTASTACK 16
#define DEBUG_AMOEBA 0
/* ---------------------------------------------------------------------- */
@ -85,6 +87,10 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp)
cmp = fmp = nullptr;
cphi = fphi = nullptr;
_moduli_array = nullptr;
_moduli_bsarray = nullptr;
_nfft_max = 0;
poli = nullptr;
conj = conjp = nullptr;
vec = vecp = nullptr;
@ -227,6 +233,9 @@ PairAmoeba::~PairAmoeba()
memory->destroy(fphidp);
memory->destroy(cphidp);
memory->destroy(_moduli_array);
memory->destroy(_moduli_bsarray);
memory->destroy(thetai1);
memory->destroy(thetai2);
memory->destroy(thetai3);
@ -349,12 +358,22 @@ void PairAmoeba::compute(int eflag, int vflag)
if (update->ntimestep <= update->beginstep+1) {
time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0;
time_induce = time_polar = time_qxfer = 0.0;
time_mpole_rspace = time_mpole_kspace = 0.0;
time_direct_rspace = time_direct_kspace = 0.0;
time_mutual_rspace = time_mutual_kspace = 0.0;
time_polar_rspace = time_polar_kspace = 0.0;
time_grid_uind = time_fphi_uind = 0.0;
if (ic_kspace) {
ic_kspace->time_fft = 0.0;
}
}
double time0,time1,time2,time3,time4,time5,time6,time7,time8;
MPI_Barrier(world);
time0 = MPI_Wtime();
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// if reneighboring step:
// augment neighbor list to include 1-5 neighbor flags
@ -410,8 +429,7 @@ void PairAmoeba::compute(int eflag, int vflag)
comm->forward_comm(this);
if (amoeba) pbc_xred();
time1 = MPI_Wtime();
time1 = platform::walltime();
// ----------------------------------------
// compute components of force field
@ -420,22 +438,22 @@ void PairAmoeba::compute(int eflag, int vflag)
// buffered 14-7 Vdwl, pairwise
if (amoeba && hal_flag) hal();
time2 = MPI_Wtime();
time2 = platform::walltime();
// Pauli repulsion, pairwise
if (!amoeba && repulse_flag) repulsion();
time3 = MPI_Wtime();
time3 = platform::walltime();
// Ewald dispersion, pairwise and long range
if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion();
time4 = MPI_Wtime();
time4 = platform::walltime();
// multipole, pairwise and long range
if (mpole_rspace_flag || mpole_kspace_flag) multipole();
time5 = MPI_Wtime();
time5 = platform::walltime();
// induced dipoles, interative CG relaxation
// communicate induce() output values needed by ghost atoms
@ -445,17 +463,17 @@ void PairAmoeba::compute(int eflag, int vflag)
cfstyle = INDUCE;
comm->forward_comm(this);
}
time6 = MPI_Wtime();
time6 = platform::walltime();
// dipoles, pairwise and long range
if (polar_rspace_flag || polar_kspace_flag) polar();
time7 = MPI_Wtime();
time7 = platform::walltime();
// charge transfer, pairwise
if (!amoeba && qxfer_flag) charge_transfer();
time8 = MPI_Wtime();
time8 = platform::walltime();
// store energy components for output by compute pair command
@ -518,6 +536,44 @@ void PairAmoeba::finish()
MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_qxfer = ave/comm->nprocs;
#if DEBUG_AMOEBA
// real-space/kspace breakdown
MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mpole_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mpole_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_direct_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_direct_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_polar_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_polar_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_grid_uind = ave/comm->nprocs;
MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_fphi_uind = ave/comm->nprocs;
double time_mutual_fft = 0;
if (ic_kspace) time_mutual_fft = ic_kspace->time_fft;
MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_fft = ave/comm->nprocs;
#endif // DEBUG_AMOEBA
double time_total = (time_init + time_hal + time_repulse + time_disp +
time_mpole + time_induce + time_polar + time_qxfer) / 100.0;
@ -534,8 +590,27 @@ void PairAmoeba::finish()
utils::logmesg(lmp," Induce time: {:<12.6g} {:6.2f}%\n", time_induce, time_induce/time_total);
utils::logmesg(lmp," Polar time: {:<12.6g} {:6.2f}%\n", time_polar, time_polar/time_total);
if (!amoeba)
utils::logmesg(lmp," Qxfer time: {:<12.6g} {:6.2f}%\n", time_qxfer, time_qxfer/time_total);
utils::logmesg(lmp," Total time: {:<12.6g}\n",time_total * 100.0);
utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total);
utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0);
#if DEBUG_AMOEBA
double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace;
double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace;
utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total);
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total);
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total);
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total);
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total);
utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total);
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total);
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total);
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total);
utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total);
utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total);
utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total);
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total);
#endif
}
}
@ -2320,6 +2395,8 @@ void PairAmoeba::grow_local()
firstneigh_pcpc = (double **)
memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc");
}
memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array");
}
/* ----------------------------------------------------------------------

View File

@ -82,6 +82,12 @@ class PairAmoeba : public Pair {
double time_init, time_hal, time_repulse, time_disp;
double time_mpole, time_induce, time_polar, time_qxfer;
double time_mpole_rspace, time_mpole_kspace;
double time_direct_rspace, time_direct_kspace;
double time_mutual_rspace, time_mutual_kspace;
double time_polar_rspace, time_polar_kspace;
double time_grid_uind, time_fphi_uind;
// energy/virial components
double ehal, erepulse, edisp, epolar, empole, eqxfer;
@ -324,8 +330,12 @@ class PairAmoeba : public Pair {
double *qfac; // convolution pre-factors
double *gridfft1; // copy of p_kspace FFT grid
double **cmp, **fmp; // Cartesian and fractional multipoles
double **cphi, **fphi;
double **cmp,**fmp; // Cartesian and fractional multipoles
double **cphi,**fphi;
double *_moduli_array; // buffers for moduli
double *_moduli_bsarray;
int _nfft_max;
// params for current KSpace solve and FFT being worked on
@ -335,8 +345,12 @@ class PairAmoeba : public Pair {
double ctf[10][10]; // indices NOT flipped vs Fortran
double ftc[10][10]; // indices NOT flipped vs Fortran
class AmoebaConvolution *m_kspace, *p_kspace, *pc_kspace, *d_kspace;
class AmoebaConvolution *i_kspace, *ic_kspace;
class AmoebaConvolution *m_kspace; // multipole KSpace
class AmoebaConvolution *p_kspace; // polar KSpace
class AmoebaConvolution *pc_kspace;
class AmoebaConvolution *d_kspace; // dispersion KSpace
class AmoebaConvolution *i_kspace; // induce KSpace
class AmoebaConvolution *ic_kspace;
// FFT grid size factors
@ -347,33 +361,33 @@ class PairAmoeba : public Pair {
void hal();
void repulsion();
void damprep(double, double, double, double, double, double, double, double, int, double, double,
double *);
virtual void repulsion();
void damprep(double, double, double, double, double, double, double, double,
int, double, double, double *);
void dispersion();
void dispersion_real();
virtual void dispersion_real();
void dispersion_kspace();
void multipole();
void multipole_real();
virtual void multipole_real();
void multipole_kspace();
void polar();
void polar_energy();
void polar_real();
void polar_kspace();
virtual void polar_real();
virtual void polar_kspace();
void damppole(double, int, double, double, double *, double *, double *);
void induce();
virtual void induce();
void ulspred();
void ufield0c(double **, double **);
virtual void ufield0c(double **, double **);
void uscale0b(int, double **, double **, double **, double **);
void dfield0c(double **, double **);
void umutual1(double **, double **);
void umutual2b(double **, double **);
virtual void umutual1(double **, double **);
virtual void umutual2b(double **, double **);
void udirect1(double **);
void udirect2b(double **, double **);
virtual void udirect2b(double **, double **);
void dampmut(double, double, double, double *);
void dampdir(double, double, double, double *, double *);
void cholesky(int, double *, double *);
@ -393,11 +407,11 @@ class PairAmoeba : public Pair {
void fphi_to_cphi(double **, double **);
void frac_to_cart();
void grid_mpole(double **, double ***);
void fphi_mpole(double ***, double **);
void grid_uind(double **, double **, double ****);
void fphi_uind(double ****, double **, double **, double **);
void grid_disp(double ***);
void grid_mpole(double **, FFT_SCALAR ***);
void fphi_mpole(FFT_SCALAR ***, double **);
void grid_uind(double **, double **, FFT_SCALAR ****);
virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **);
void grid_disp(FFT_SCALAR ***);
void kewald();
void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &,

View File

@ -45,6 +45,10 @@ depend () {
# add one if statement per parent package
# add one depend() call per child package that depends on that parent
if (test $1 = "AMOEBA") then
depend GPU
fi
if (test $1 = "ASPHERE") then
depend GPU
depend OPENMP

View File

@ -28,6 +28,8 @@ action () {
# list of files with optional dependencies
action amoeba_convolution_gpu.cpp amoeba_convolution.cpp
action amoeba_convolution_gpu.h amoeba_convolution.cpp
action fix_gpu.cpp
action fix_gpu.h
action fix_nve_gpu.h
@ -41,6 +43,8 @@ action fix_npt_gpu.cpp
action fix_nve_asphere_gpu.h fix_nve_asphere.h
action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp
action gpu_extra.h
action pair_amoeba_gpu.cpp pair_amoeba.cpp
action pair_amoeba_gpu.h pair_amoeba.h
action pair_beck_gpu.cpp pair_beck.cpp
action pair_beck_gpu.h pair_beck.h
action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp
@ -89,6 +93,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp
action pair_gauss_gpu.h pair_gauss.h
action pair_gayberne_gpu.cpp pair_gayberne.cpp
action pair_gayberne_gpu.h pair_gayberne.cpp
action pair_hippo_gpu.cpp pair_hippo.cpp
action pair_hippo_gpu.h pair_hippo.cpp
action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp
action pair_lj96_cut_gpu.h pair_lj96_cut.h
action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp
@ -113,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp
action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h
action pair_lj_cut_gpu.cpp
action pair_lj_cut_gpu.h
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
action pair_lj_smooth_gpu.cpp pair_lj_smooth.cpp
action pair_lj_smooth_gpu.h pair_lj_smooth.cpp
action pair_lj_expand_gpu.cpp
@ -155,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp
action pppm_gpu.h pppm.cpp
action pair_ufm_gpu.cpp pair_ufm.cpp
action pair_ufm_gpu.h pair_ufm.h
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
# edit 2 Makefile.package files to include/exclude package info

View File

@ -0,0 +1,181 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/ Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "amoeba_convolution_gpu.h"
#include "comm.h"
#include "fft3d_wrap.h"
#include "remap_wrap.h"
#include "grid3d.h"
using namespace LAMMPS_NS;
// DEBUG
#define DEBUG_AMOEBA 0
#if DEBUG_AMOEBA
// labels for the six convolution flavors selected by "which";
// sized [6] to match the initializer list (was [7], leaving a stray null slot);
// static: an identical table exists in amoeba_convolution.cpp, so internal
// linkage avoids a duplicate-symbol link error when debugging is enabled in both
static char *labels[6] =
  {(char *) "MPOLE_GRID", (char *) "POLAR_GRID",
   (char *) "POLAR_GRIDC", (char *) "DISP_GRID",
   (char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"};
// stages at which debug_scalar()/debug_file() can snapshot grid data
enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2};
#endif
// END DEBUG

// if non-zero, rescale FFT data by 1/nfft_global after the forward FFT
#define SCALE 0

//#define USE_AMOEBA_FFT
#ifdef USE_AMOEBA_FFT
// External functions from GPU library
int amoeba_setup_fft(const int size, const int numel, const int element_type);
int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode);
#endif
/* ----------------------------------------------------------------------
partition an FFT grid across processors
both for a brick and FFT x pencil decomposition
nx,ny,nz = global FFT grid size
order = size of stencil in each dimension that maps atoms to grid
adapted from PPPM::set_grid_local()
------------------------------------------------------------------------- */
// construct the GPU flavor of the convolution helper; all grid setup
// (brick/pencil partitioning, FFT plans, comm buffers) is done by the
// base-class constructor, so this simply forwards its arguments
AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, int nx_caller,
                                           int ny_caller, int nz_caller,
                                           int order_caller, int which_caller) :
    AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller,
                      order_caller, which_caller) {}
/* ----------------------------------------------------------------------
   perform pre-convolution grid operations for 4d cgrid_brick array
   gathers ghost contributions into the owned brick, packs the complex
   (re,im) pairs into the 1d cfft buffer, remaps brick -> x-pencil,
   then performs the forward FFT (accumulating FFT time in time_fft)
   returns cfft, the FFT-space data the caller convolves against
------------------------------------------------------------------------- */

FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
{
  int ix,iy,iz,n;

  // reverse comm for 4d brick grid + ghosts
  // sums ghost-cell contributions back into the owned grid cells

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d");
#endif
  gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);
#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d");
  debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d");
#endif

  // copy owned 4d brick grid values to FFT grid
  // two scalars (real, imaginary) per grid point, interleaved in cfft

  n = 0;
  for (iz = nzlo_in; iz <= nzhi_in; iz++)
    for (iy = nylo_in; iy <= nyhi_in; iy++)
      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
        cfft[n++] = cgrid_brick[iz][iy][ix][0];
        cfft[n++] = cgrid_brick[iz][iy][ix][1];
      }

  // remap FFT grid from brick to x pencil partitioning
  // NOTE: could just setup FFT to start from brick decomp and skip remap

  remap->perform(cfft,cfft,remap_buf);

#if DEBUG_AMOEBA
  debug_scalar(FFT,"PRE Convo / POST Remap");
  debug_file(FFT,"pre.convo.post.remap");
#endif

  // barrier so the measured FFT time is not skewed by load imbalance

  double time0,time1;
  MPI_Barrier(world);
  time0 = platform::walltime();

  // perform forward FFT
  // USE_AMOEBA_FFT routes through the GPU library's 1d FFT when enabled

#ifdef USE_AMOEBA_FFT
  amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD);
#else
  fft1->compute(cfft,cfft,FFT3d::FORWARD);
#endif
  time1 = platform::walltime();
  time_fft += time1 - time0;

  // optional 1/N normalization after the forward transform (SCALE is 0 by default)

  if (SCALE) {
    double scale = 1.0/nfft_global;
    for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
  }

#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"PRE Convo / POST FFT");
  debug_file(CFFT1,"pre.convo.post.fft");
#endif
  return cfft;
}
/* ----------------------------------------------------------------------
   perform post-convolution grid operations for 4d cgrid_brick array
   backward FFT on the pencil data, unpack the interleaved (re,im)
   values into the owned 4d brick grid, then forward-comm ghost cells
   returns the brick grid as an opaque pointer
------------------------------------------------------------------------- */

void *AmoebaConvolutionGPU::post_convolution_4d()
{
#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"POST Convo / PRE FFT");
  debug_file(CFFT1,"post.convo.pre.fft");
#endif

  // backward FFT, timed across all ranks (barrier removes load-imbalance skew)

  MPI_Barrier(world);
  double tstart = platform::walltime();
  fft2->compute(cfft,cfft,FFT3d::BACKWARD);
  time_fft += platform::walltime() - tstart;

#if DEBUG_AMOEBA
  debug_scalar(CFFT2,"POST Convo / POST FFT");
  debug_file(CFFT2,"post.convo.post.fft");
#endif

  // scatter the 1d interleaved complex values back into the owned
  // portion of the 4d brick grid: two scalars (re,im) per grid point

  int m = 0;
  for (int kz = nzlo_in; kz <= nzhi_in; kz++)
    for (int ky = nylo_in; ky <= nyhi_in; ky++)
      for (int kx = nxlo_in; kx <= nxhi_in; kx++) {
        cgrid_brick[kz][ky][kx][0] = cfft[m++];
        cgrid_brick[kz][ky][kx][1] = cfft[m++];
      }

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d");
  debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d");
#endif

  // forward comm populates the ghost cells of the brick grid

  gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);

  return (void *) cgrid_brick;
}

View File

@ -0,0 +1,32 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/ Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H
#define LMP_AMOEBA_CONVOLUTION_GPU_H
#include "amoeba_convolution.h"
namespace LAMMPS_NS {
// GPU-side variant of AmoebaConvolution: overrides only the 4d
// pre-/post-convolution grid operations; all grid partitioning and
// FFT setup is inherited from the base class
class AmoebaConvolutionGPU : public AmoebaConvolution {
 public:
  // forwards (lmp, pair, nx, ny, nz, order, which) to the base constructor
  AmoebaConvolutionGPU(class LAMMPS *, class Pair *, int, int, int, int, int);
  FFT_SCALAR *pre_convolution_4d() override;   // brick -> FFT space (forward FFT)
  void *post_convolution_4d() override;        // FFT space -> brick (backward FFT)
};
} // namespace LAMMPS_NS
#endif

View File

@ -131,7 +131,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
_gpu_mode = GPU_NEIGH;
_particle_split = 1.0;
int nthreads = 0;
int newtonflag = 0;
int newtonflag = force->newton_pair;
int threads_per_atom = -1;
double binsize = 0.0;
char *opencl_args = nullptr;
@ -360,6 +360,8 @@ double FixGPU::memory_usage()
return bytes;
}
/* ---------------------------------------------------------------------- */
double FixGPU::binsize(const double subx, const double suby,
const double subz, const int nlocal,
const double cut) {

2067
src/GPU/pair_amoeba_gpu.cpp Normal file

File diff suppressed because it is too large Load Diff

72
src/GPU/pair_amoeba_gpu.h Normal file
View File

@ -0,0 +1,72 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
// clang-format off
PairStyle(amoeba/gpu,PairAmoebaGPU);
// clang-format on
#else
#ifndef LMP_PAIR_AMOEBA_GPU_H
#define LMP_PAIR_AMOEBA_GPU_H
#include "pair_amoeba.h"
namespace LAMMPS_NS {
// GPU-accelerated AMOEBA pair style: overrides the virtual real-space and
// induced-dipole hooks declared in PairAmoeba to offload them to the GPU library
class PairAmoebaGPU : public PairAmoeba {
 public:
  PairAmoebaGPU(LAMMPS *lmp);
  ~PairAmoebaGPU() override;
  void init_style() override;
  double memory_usage() override;

  // neighbor-list handling modes for the GPU library
  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };

  // overrides of PairAmoeba virtuals (see pair_amoeba.h)
  void induce() override;
  void multipole_real() override;
  void udirect2b(double **, double **) override;
  void umutual1(double **, double **) override;
  void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
  void umutual2b(double **, double **) override;
  void ufield0c(double **, double **) override;
  void polar_real() override;
  void polar_kspace() override;

 private:
  int gpu_mode;          // one of GPU_FORCE/GPU_NEIGH/GPU_HYB_NEIGH
  double cpu_time;
  void *tq_pinned;       // presumably host-pinned torque buffer — confirm in pair_amoeba_gpu.cpp
  void *fieldp_pinned;   // presumably host-pinned field buffer — confirm in pair_amoeba_gpu.cpp
  bool acc_float;        // NOTE(review): looks like single-precision accumulation flag — verify

  // which terms the GPU library implementation supports; when a flag is
  // false the corresponding CPU base-class path is used instead (assumed —
  // set in the .cpp, not shown here)
  bool gpu_hal_ready;
  bool gpu_repulsion_ready;
  bool gpu_dispersion_real_ready;
  bool gpu_multipole_real_ready;
  bool gpu_udirect2b_ready;
  bool gpu_umutual1_ready;
  bool gpu_fphi_uind_ready;
  bool gpu_umutual2b_ready;
  bool gpu_polar_real_ready;

  // CPU fallback for udirect2b
  void udirect2b_cpu();

  // unpack per-atom torque of type numtyp and accumulate force/virial
  template<class numtyp>
  void compute_force_from_torque(const numtyp*, double**, double*);
};
} // namespace LAMMPS_NS
#endif
#endif

1494
src/GPU/pair_hippo_gpu.cpp Normal file

File diff suppressed because it is too large Load Diff

73
src/GPU/pair_hippo_gpu.h Normal file
View File

@ -0,0 +1,73 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
// clang-format off
PairStyle(hippo/gpu,PairHippoGPU);
// clang-format on
#else
#ifndef LMP_PAIR_HIPPO_GPU_H
#define LMP_PAIR_HIPPO_GPU_H
#include "pair_amoeba.h"
namespace LAMMPS_NS {
// GPU-accelerated HIPPO pair style; derives from PairAmoeba (which also
// implements the HIPPO model) and overrides its virtual hooks, including
// the HIPPO-specific repulsion() and dispersion_real() terms
class PairHippoGPU : public PairAmoeba {
 public:
  PairHippoGPU(LAMMPS *lmp);
  ~PairHippoGPU() override;
  void init_style() override;
  double memory_usage() override;

  // neighbor-list handling modes for the GPU library
  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };

  // overrides of PairAmoeba virtuals (see pair_amoeba.h)
  void induce() override;
  void repulsion() override;
  void dispersion_real() override;
  void multipole_real() override;
  void udirect2b(double **, double **) override;
  void umutual1(double **, double **) override;
  void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
  void umutual2b(double **, double **) override;
  void ufield0c(double **, double **) override;
  void polar_real() override;

 private:
  int gpu_mode;          // one of GPU_FORCE/GPU_NEIGH/GPU_HYB_NEIGH
  double cpu_time;
  void *tq_pinned;       // presumably host-pinned torque buffer — confirm in pair_hippo_gpu.cpp
  void *fieldp_pinned;   // presumably host-pinned field buffer — confirm in pair_hippo_gpu.cpp
  bool acc_float;        // NOTE(review): looks like single-precision accumulation flag — verify

  // which terms the GPU library implementation supports; when a flag is
  // false the corresponding CPU base-class path is used instead (assumed —
  // set in the .cpp, not shown here)
  bool gpu_hal_ready;
  bool gpu_repulsion_ready;
  bool gpu_dispersion_real_ready;
  bool gpu_multipole_real_ready;
  bool gpu_udirect2b_ready;
  bool gpu_umutual1_ready;
  bool gpu_fphi_uind_ready;
  bool gpu_umutual2b_ready;
  bool gpu_polar_real_ready;

  // CPU fallback for udirect2b
  void udirect2b_cpu();

  // unpack per-atom torque of type numtyp and accumulate force/virial
  template<class numtyp>
  void compute_force_from_torque(const numtyp*, double**, double*);
};
} // namespace LAMMPS_NS
#endif
#endif

View File

@ -21,6 +21,7 @@
#include "atom.h"
#include "atom_vec.h"
#include "citeme.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
@ -36,12 +37,25 @@
#include "pair_reaxff.h"
#include "reaxff_defs.h"
#include <algorithm>
#include <cstring>
#include <exception>
#include <random>
using namespace LAMMPS_NS;
using namespace FixConst;
static const char cite_reaxff_species_delete[] =
"fix reaxff/species, 'delete' keyword: https://doi.org/10.1016/j.carbon.2022.11.002\n\n"
"@Article{Gissinger23,\n"
" author = {J. R. Gissinger, S. R. Zavada, J. G. Smith, J. Kemppainen, I. Gallegos, G. M. Odegard, E. J. Siochi, K. E. Wise},\n"
" title = {Predicting char yield of high-temperature resins},\n"
" journal = {Carbon},\n"
" year = 2023,\n"
" volume = 202,\n"
" pages = {336-347}\n"
"}\n\n";
/* ---------------------------------------------------------------------- */
FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
@ -145,6 +159,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
ele = filepos = filedel = nullptr;
eleflag = posflag = padflag = 0;
delflag = specieslistflag = masslimitflag = 0;
delete_Nlimit = delete_Nsteps = 0;
singlepos_opened = multipos_opened = del_opened = 0;
multipos = 0;
@ -221,7 +236,12 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
} else
error->all(FLERR, "Unknown fix reaxff/species delete option: {}", arg[iarg]);
// rate limit when deleting molecules
} else if (strcmp(arg[iarg], "delete_rate_limit") == 0) {
if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species delete_rate_limit", error);
delete_Nlimit = utils::numeric(FLERR, arg[iarg+1], false, lmp);
delete_Nsteps = utils::numeric(FLERR, arg[iarg+2], false, lmp);
iarg += 3;
// position of molecules
} else if (strcmp(arg[iarg], "position") == 0) {
if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species position", error);
@ -260,6 +280,15 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
if (delflag && specieslistflag && masslimitflag)
error->all(FLERR, "Incompatible combination fix reaxff/species command options");
if (delete_Nlimit > 0) {
if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete);
memory->create(delete_Tcount,delete_Nsteps,"reaxff/species:delete_Tcount");
for (int i = 0; i < delete_Nsteps; i++)
delete_Tcount[i] = -1;
delete_Tcount[0] = 0;
}
vector_nmole = 0;
vector_nspec = 0;
}
@ -279,6 +308,7 @@ FixReaxFFSpecies::~FixReaxFFSpecies()
memory->destroy(Mol2Spec);
memory->destroy(MolType);
memory->destroy(MolName);
memory->destroy(delete_Tcount);
delete[] filepos;
delete[] filedel;
@ -375,7 +405,13 @@ void FixReaxFFSpecies::Output_ReaxFF_Bonds(bigint ntimestep, FILE * /*fp*/)
// point to fix_ave_atom
f_SPECBOND->end_of_step();
if (ntimestep != nvalid) return;
if (ntimestep != nvalid) {
// push back delete_Tcount on every step
if (delete_Nlimit > 0)
for (int i = delete_Nsteps-1; i > 0; i--)
delete_Tcount[i] = delete_Tcount[i-1];
return;
}
nlocal = atom->nlocal;
@ -826,6 +862,15 @@ void FixReaxFFSpecies::WritePos(int Nmole, int Nspec)
void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
{
int ndeletions;
int headroom = -1;
if (delete_Nlimit > 0) {
if (delete_Tcount[delete_Nsteps-1] == -1) return;
ndeletions = delete_Tcount[0] - delete_Tcount[delete_Nsteps-1];
headroom = MAX(0, delete_Nlimit - ndeletions);
if (headroom == 0) return;
}
int i, j, m, n, itype, cid;
int ndel, ndelone, count, count_tmp;
int *Nameall;
@ -856,7 +901,23 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
int *marklist;
memory->create(marklist, nlocal, "reaxff/species:marklist");
for (m = 1; m <= Nmole; m++) {
std::random_device rnd;
std::minstd_rand park_rng(rnd());
int *molrange;
memory->create(molrange,Nmole,"reaxff/species:molrange");
for (m = 0; m < Nmole; m++)
molrange[m] = m + 1;
if (delete_Nlimit > 0) {
// shuffle index when using rate_limit, in case order is biased
if (comm->me == 0)
std::shuffle(&molrange[0],&molrange[Nmole], park_rng);
MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world);
}
int this_delete_Tcount = 0;
for (int mm = 0; mm < Nmole; mm++) {
if (this_delete_Tcount == headroom) break;
m = molrange[mm];
localmass = totalmass = count = nmarklist = 0;
for (n = 0; n < ntypes; n++) Name[n] = 0;
@ -896,6 +957,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
// find corresponding moltype
if (totalmass > massmin && totalmass < massmax) {
this_delete_Tcount++;
for (j = 0; j < nmarklist; j++) {
mark[marklist[j]] = 1;
deletecount[Mol2Spec[m - 1]] += 1.0 / (double) count;
@ -905,6 +967,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
if (count > 0) {
for (i = 0; i < ndelspec; i++) {
if (del_species[i] == species_str) {
this_delete_Tcount++;
for (j = 0; j < nmarklist; j++) {
mark[marklist[j]] = 1;
deletecount[i] += 1.0 / (double) count;
@ -976,6 +1039,14 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
}
}
// push back delete_Tcount on every step
if (delete_Nlimit > 0) {
for (i = delete_Nsteps-1; i > 0; i--)
delete_Tcount[i] = delete_Tcount[i-1];
delete_Tcount[0] += this_delete_Tcount;
}
if (ndel && (atom->map_style != Atom::MAP_NONE)) {
atom->nghost = 0;
atom->map_init();
@ -988,6 +1059,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
memory->destroy(marklist);
memory->destroy(mark);
memory->destroy(deletecount);
memory->destroy(molrange);
}
/* ---------------------------------------------------------------------- */

View File

@ -60,6 +60,7 @@ class FixReaxFFSpecies : public Fix {
FILE *fp, *pos, *fdel;
int eleflag, posflag, multipos, padflag, setupflag;
int delflag, specieslistflag, masslimitflag;
int delete_Nlimit, delete_Nsteps, *delete_Tcount;
double massmin, massmax;
int singlepos_opened, multipos_opened, del_opened;
char *ele, **eletype, *filepos, *filedel;

View File

@ -2345,6 +2345,18 @@ void Atom::setup_sort_bins()
return;
}
#ifdef LMP_GPU
if (userbinsize == 0.0) {
auto ifix = dynamic_cast<FixGPU *>(modify->get_fix_by_id("package_gpu"));
if (ifix) {
const double subx = domain->subhi[0] - domain->sublo[0];
const double suby = domain->subhi[1] - domain->sublo[1];
const double subz = domain->subhi[2] - domain->sublo[2];
binsize = ifix->binsize(subx, suby, subz, atom->nlocal, 0.5 * neighbor->cutneighmax);
}
}
#endif
double bininv = 1.0/binsize;
// nbin xyz = local bins

View File

@ -535,6 +535,7 @@ void Neighbor::init()
int flag=0;
for (int isub=0; isub < ph->nstyles; ++isub) {
if (force->pair_match("amoeba",0,isub)
|| force->pair_match("hippo",0,isub)
|| force->pair_match("coul/wolf",0,isub)
|| force->pair_match("coul/dsf",0,isub)
|| force->pair_match("coul/exclude",0)
@ -545,6 +546,7 @@ void Neighbor::init()
special_flag[1] = special_flag[2] = special_flag[3] = 2;
} else {
if (force->pair_match("amoeba",0)
|| force->pair_match("hippo",0)
|| force->pair_match("coul/wolf",0)
|| force->pair_match("coul/dsf",0)
|| force->pair_match("coul/exclude",0)