diff --git a/cmake/Modules/LAMMPSUtils.cmake b/cmake/Modules/LAMMPSUtils.cmake index d42f91f10e..9b42dafc44 100644 --- a/cmake/Modules/LAMMPSUtils.cmake +++ b/cmake/Modules/LAMMPSUtils.cmake @@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir) endfunction() macro(pkg_depends PKG1 PKG2) - if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2})) - message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package") + if(DEFINED BUILD_${PKG2}) + if(PKG_${PKG1} AND NOT BUILD_${PKG2}) + message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON") + endif() + elseif(DEFINED PKG_${PKG2}) + if(PKG_${PKG1} AND NOT PKG_${PKG2}) + message(WARNING "The ${PKG1} package depends on the ${PKG2} package. Enabling it.") + set(PKG_${PKG2} ON CACHE BOOL "" FORCE) + endif() endif() endmacro() diff --git a/cmake/Modules/Packages/COMPRESS.cmake b/cmake/Modules/Packages/COMPRESS.cmake index bdcf1aa3f8..4e1ab846a7 100644 --- a/cmake/Modules/Packages/COMPRESS.cmake +++ b/cmake/Modules/Packages/COMPRESS.cmake @@ -1,4 +1,9 @@ -find_package(ZLIB REQUIRED) +find_package(ZLIB) +if(NOT ZLIB_FOUND) + message(WARNING "No Zlib development support found. 
Disabling COMPRESS package...") + set(PKG_COMPRESS OFF CACHE BOOL "" FORCE) + return() +endif() target_link_libraries(lammps PRIVATE ZLIB::ZLIB) find_package(PkgConfig QUIET) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index dd66276ae4..24d9538206 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -26,6 +26,19 @@ elseif(GPU_PREC STREQUAL "SINGLE") set(GPU_PREC_SETTING "SINGLE_SINGLE") endif() +option(GPU_DEBUG "Enable debugging code of the GPU package" OFF) +mark_as_advanced(GPU_DEBUG) + +if(PKG_AMOEBA AND FFT_SINGLE) + message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT") +endif() + +if (PKG_AMOEBA) + list(APPEND GPU_SOURCES + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp) +endif() + file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu) @@ -151,7 +164,12 @@ if(GPU_API STREQUAL "CUDA") add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS}) target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS}) - target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS}) + target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() if(CUDPP_OPT) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini) target_compile_definitions(gpu PRIVATE -DUSE_CUDPP) @@ -192,6 +210,7 @@ elseif(GPU_API STREQUAL "OPENCL") ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu 
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu + ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu ) foreach(GPU_KERNEL ${GPU_LIB_CU}) @@ -208,6 +227,7 @@ elseif(GPU_API STREQUAL "OPENCL") GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu) GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu) GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu) + GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu) list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h @@ -217,14 +237,18 @@ elseif(GPU_API STREQUAL "OPENCL") ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h + ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ) add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu PRIVATE OpenCL::OpenCL) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) - + target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) + endif() target_link_libraries(lammps PRIVATE gpu) add_executable(ocl_get_devices 
${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp) @@ -374,8 +398,12 @@ elseif(GPU_API STREQUAL "HIP") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_HIP) + target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() target_link_libraries(gpu PRIVATE hip::host) if(HIP_USE_DEVICE_SORT) diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index be2ba0fc60..659d185e18 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -126,10 +126,11 @@ CMake build -D GPU_API=value # value = opencl (default) or cuda or hip -D GPU_PREC=value # precision setting # value = double or mixed (default) or single - -D HIP_PATH # path to HIP installation. Must be set if GPU_API=HIP -D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda - # value = sm_XX, see below - # default is sm_50 + # value = sm_XX (see below, default is sm_50) + -D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers + # value = yes or no (default) + -D HIP_PATH=value # value = path to HIP installation. Must be set if GPU_API=HIP -D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip # value depends on selected HIP_PLATFORM # default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index f5924f12c7..fd1ef692c5 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -39,7 +39,7 @@ OPT. 
* :doc:`agni (o) ` * :doc:`airebo (io) ` * :doc:`airebo/morse (io) ` - * :doc:`amoeba ` + * :doc:`amoeba (g) ` * :doc:`atm ` * :doc:`awpmd/cut ` * :doc:`beck (go) ` @@ -126,7 +126,7 @@ OPT. * :doc:`hbond/dreiding/lj (o) ` * :doc:`hbond/dreiding/morse (o) ` * :doc:`hdnnp ` - * :doc:`hippo ` + * :doc:`hippo (g) ` * :doc:`ilp/graphene/hbn (t) ` * :doc:`ilp/tmd (t) ` * :doc:`kolmogorov/crespi/full ` diff --git a/doc/src/fix_reaxff_species.rst b/doc/src/fix_reaxff_species.rst index c78c05a35e..383b8212f9 100644 --- a/doc/src/fix_reaxff_species.rst +++ b/doc/src/fix_reaxff_species.rst @@ -39,6 +39,9 @@ Syntax *masslimit* value = massmin massmax massmin = minimum molecular weight of species to delete massmax = maximum molecular weight of species to delete + *delete_rate_limit* value = Nlimit Nsteps + Nlimit = maximum number of deletions allowed to occur within interval + Nsteps = the interval (number of timesteps) over which to count deletions Examples """""""" @@ -142,7 +145,13 @@ When using the *masslimit* keyword, each line of the *filedel* file contains the timestep on which deletions occurs, followed by how many of each species are deleted (with quantities preceding chemical formulae). The *specieslist* and *masslimit* keywords cannot both be -used in the same *reaxff/species* fix. +used in the same *reaxff/species* fix. The *delete_rate_limit* +keyword can enforce an upper limit on the overall rate of molecule +deletion. The number of deletion occurrences is limited to Nlimit +within an interval of Nsteps timesteps. When using the +*delete_rate_limit* keyword, no deletions are permitted to occur +within the first Nsteps timesteps of the first run (after reading +either a data or restart file). ---------- diff --git a/doc/src/fix_rigid.rst b/doc/src/fix_rigid.rst index 9a958e50d1..3a2477f90a 100644 --- a/doc/src/fix_rigid.rst +++ b/doc/src/fix_rigid.rst @@ -732,8 +732,8 @@ choices: * Use one of the 4 NPT or NPH styles for the rigid bodies. 
Use the *dilate* all option so that it will dilate the positions of the - *non-rigid particles as well. Use :doc:`fix nvt ` (or any - *other thermostat) for the non-rigid particles. + non-rigid particles as well. Use :doc:`fix nvt ` (or any + other thermostat) for the non-rigid particles. * Use :doc:`fix npt ` for the group of non-rigid particles. Use the *dilate* all option so that it will dilate the center-of-mass positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index f5c0ea14df..6ef92a6938 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -1,11 +1,18 @@ .. index:: pair_style amoeba +.. index:: pair_style amoeba/gpu .. index:: pair_style hippo +.. index:: pair_style hippo/gpu pair_style amoeba command ========================= +Accelerator Variants: *amoeba/gpu* + pair_style hippo command ======================== + +Accelerator Variants: *hippo/gpu* + Syntax """""" @@ -127,6 +134,10 @@ version discussed in :ref:`(Ponder) `, :ref:`(Ren) implementation of HIPPO in LAMMPS matches the version discussed in :ref:`(Rackers) `. +.. versionadded:: TBD + +Accelerator support via the GPU package is available. + ---------- Only a single pair_coeff command is used with either the *amoeba* and @@ -187,6 +198,19 @@ These pair styles can only be used via the *pair* keyword of the ---------- +.. include:: accel_styles.rst + +.. note:: + + Using the GPU accelerated pair styles 'amoeba/gpu' or 'hippo/gpu' + when compiling the GPU package for OpenCL has a few known issues + when running on integrated GPUs and the calculation may crash. + + The GPU accelerated pair styles are also not (yet) compatible + with single precision FFTs. 
+ +---------- + Restrictions """""""""""" diff --git a/lib/gpu/Makefile.lammps.standard b/lib/gpu/Makefile.lammps.standard index 9526e8e373..0bb3394b3e 100644 --- a/lib/gpu/Makefile.lammps.standard +++ b/lib/gpu/Makefile.lammps.standard @@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda endif gpu_SYSINC = -gpu_SYSLIB = -lcudart -lcuda +gpu_SYSLIB = -lcudart -lcuda -lcufft gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 56942d3f3c..298d404117 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -1,9 +1,17 @@ +# Common headers for kernels +PRE1_H = lal_preprocessor.h lal_aux_fun1.h + # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \ lal_pre_cuda_hip.h -ALL_H = $(NVD_H) $(wildcard ./lal_*.h) +# Headers for Host files +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ + lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ + lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ + lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H) + # Source files SRCS := $(wildcard ./lal_*.cpp) OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o)) @@ -54,13 +62,40 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h -$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) +$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu $(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@ # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + 
+$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H) + $(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT @@ -77,10 +112,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp $(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu - $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu + $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu - $(CUDA) -o $@ -c cudpp_mini/scan_app.cu + $(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini #endif # build libgpu.a diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 2ff98827d4..d318da15dd 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h) OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h # Headers for Host files -HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \ +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ lal_neighbor_shared.h lal_pre_ocl_config.h 
$(OCL_H) @@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra. $(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h $(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h; +$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h + $(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h; + $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; diff --git a/lib/gpu/geryon/hip_macros.h b/lib/gpu/geryon/hip_macros.h index 96313ec87e..e16caf4944 100644 --- a/lib/gpu/geryon/hip_macros.h +++ b/lib/gpu/geryon/hip_macros.h @@ -26,6 +26,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h index ac2e6cc682..19c8ff4b6c 100644 --- a/lib/gpu/geryon/nvd_macros.h +++ b/lib/gpu/geryon/nvd_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 4163d40881..588c53c8fa 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -309,15 +309,14 @@ class UCL_Device { /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes - inline size_t max_pitch(const int i) { return 0; } + inline size_t max_pitch(const int) { return 0; } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported() { return sharing_supported(_device); } /// Returns false if 
accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ - inline bool sharing_supported(const int i) - { return true; } + inline bool sharing_supported(const int) { return true; } /// True if the device is a sub-device inline bool is_subdevice() diff --git a/lib/gpu/geryon/ocl_macros.h b/lib/gpu/geryon/ocl_macros.h index 5e5a190ede..652d7795e9 100644 --- a/lib/gpu/geryon/ocl_macros.h +++ b/lib/gpu/geryon/ocl_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index bfc260889a..5d8b9808bd 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o, template inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, - const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){ cl_mem_flags buffer_perm; cl_map_flags map_perm; if (kind==UCL_READ_ONLY) { @@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t /*dst_offset*/, const size_t src_offset) { if (src.cbegin()==dst.cbegin()) { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 1S\n"; @@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t dst_offset, const size_t /*src_offset*/) { if (src.cbegin()==dst.cbegin()) { if (block) ucl_sync(cq); #ifdef UCL_DBG_MEM_TRACE diff --git a/lib/gpu/geryon/ocl_texture.h 
b/lib/gpu/geryon/ocl_texture.h index 8ddde5b2a3..87db3794a6 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -35,19 +35,19 @@ class UCL_Texture { UCL_Texture() {} ~UCL_Texture() {} /// Construct with a specified texture reference - inline UCL_Texture(UCL_Program &prog, const char *texture_name) { } + inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Set the texture reference for this object - inline void get_texture(UCL_Program &prog, const char *texture_name) { } + inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Bind a float array where each fetch grabs a vector of length numel template - inline void bind_float(mat_typ &vec, const unsigned numel) { } + inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { } /// Unbind the texture reference from the memory allocation inline void unbind() { } /// Make a texture reference available to kernel - inline void allow(UCL_Kernel &kernel) { } + inline void allow(UCL_Kernel & /*kernel*/) { } private: friend class UCL_Kernel; @@ -62,7 +62,7 @@ class UCL_Const { inline UCL_Const(UCL_Program &prog, const char *global_name) { get_global(prog,global_name); } /// Set the global reference for this object - inline void get_global(UCL_Program &prog, const char *global_name) { + inline void get_global(UCL_Program &prog, const char * /*global_name*/) { if (_active) { CL_DESTRUCT_CALL(clReleaseContext(_context)); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 189871e631..8f55a91a28 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -71,7 +71,7 @@ class UCL_Timer { inline void init(UCL_Device &dev) { init(dev,dev.cq()); } /// Initialize command queue for timing - inline void init(UCL_Device &dev, command_queue &cq) { + inline void init(UCL_Device & /*dev*/, command_queue &cq) { clear(); _cq=cq; clRetainCommandQueue(_cq); diff 
--git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c906a14f30..94b57f7a09 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> { // Should never be here template struct _host_host_copy { template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) { assert(0==1); } template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) { assert(0==1); } }; @@ -470,24 +469,22 @@ template struct _ucl_cast_copy { // Neither on host or both on host template <> struct _ucl_cast_copy<1,1> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; @@ -495,24 +492,22 @@ template <> struct 
_ucl_cast_copy<1,1> { // Neither on host or both on host template <> struct _ucl_cast_copy<0,0> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 9158e145b3..5e281fef07 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -230,8 +230,8 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view_offset(const size_t offset,ucl_type 
&input,const size_t rows, - const size_t cols) { + inline void view_offset(const size_t offset,ucl_type &input, + const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 2f49f9f633..9f734ac40c 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template - inline void view(ptr_type *input, const size_t rows, const size_t cols, + inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols, UCL_Device &dev) { #ifdef UCL_DEBUG assert(rows==1); @@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + inline void view_offset(const size_t offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h index a10f3cdb3f..9bc2c40fe2 100644 --- a/lib/gpu/geryon/ucl_s_obj_help.h +++ b/lib/gpu/geryon/ucl_s_obj_help.h @@ -27,7 +27,7 @@ template struct _ucl_s_obj_help; // -- Can potentially use same memory if shared by accelerator template <> struct _ucl_s_obj_help<1> { template - static inline int alloc(t1 &host, t2 &device, t3 &_buffer, + static inline int 
alloc(t1 &host, t2 &device, t3 & /*_buffer*/, const int cols, UCL_Device &acc, const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { @@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> { } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,async); } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cq); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - const bool async) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cols,cq); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, - t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,rows,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, - t3 &buffer, command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,rows,cols,cq); } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) { if (device.kind()==UCL_VIEW) { device.view(host); return UCL_SUCCESS; @@ -353,7 +349,7 @@ template struct _ucl_s_obj_help { } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int 
dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) { int err=buff.resize(cols); if (err!=UCL_SUCCESS) return err; diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp new file mode 100644 index 0000000000..5e19997913 --- /dev/null +++ b/lib/gpu/lal_amoeba.cpp @@ -0,0 +1,322 @@ +/*************************************************************************** + amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the amoeba pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "amoeba_cl.h" +#elif defined(USE_CUDART) +const char *amoeba=0; +#else +#include "amoeba_cubin.h" +#endif + +#include "lal_amoeba.h" +#include +namespace LAMMPS_AL { +#define AmoebaT Amoeba + +extern Device device; + +template +AmoebaT::Amoeba() : BaseAmoeba(), + _allocated(false) { +} + +template +AmoebaT::~Amoeba() { + clear(); +} + +template +int AmoebaT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double * /*host_special_repel*/, + const double * /*host_special_disp*/, + const double *host_special_mpole, + const double * /*host_special_polar_wscale*/, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + 
const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,amoeba, + "k_amoeba_multipole", "k_amoeba_udirect2b", + "k_amoeba_umutual2b", "k_amoeba_polar", + "k_amoeba_fphi_uind", "k_amoeba_fphi_mpole", + "k_amoeba_short_nbor", "k_amoeba_special15"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = host_dirdamp[i]; + host_write[i].w = host_amtype2class[i]; + } + + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = (numtyp)0; + host_write2[i].w = (numtyp)0; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_hal[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=host_special_mpole[i]; + } + ucl_copy(sp_amoeba,dview,5,false); + + _polar_dscale = polar_dscale; + _polar_uscale 
= polar_uscale; + + _allocated=true; + this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + + sp_amoeba.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); + return 0; +} + +template +void AmoebaT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff_amtype.clear(); + coeff_amclass.clear(); + sp_amoeba.clear(); + + this->clear_atomic(); +} + +template +double AmoebaT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Amoeba); +} + +// --------------------------------------------------------------------------- +// Calculate the multipole real-space term, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step for AMOEBA + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + 
&this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Launch the real-space permanent field kernel +// --------------------------------------------------------------------------- +template +int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_udirect2b.set_size(GX,BX); + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Launch the real-space induced field kernel, returning field and fieldp +// --------------------------------------------------------------------------- +template +int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) { + int 
ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Launch the polar real-space kernel, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + */ + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if 
(!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_polar_avail = false; + + return GX; +} + +template class Amoeba; +} diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu new file mode 100644 index 0000000000..f572d3ebd0 --- /dev/null +++ b/lib/gpu/lal_amoeba.cu @@ -0,0 +1,2099 @@ +// ************************************************************************** +// amoeba.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the amoeba pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_aux_fun1.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif 
+#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + 
red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += 
shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + int m; + for (m = 1; m < 6; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 6; m++) bn[m] *= felec; + + numtyp term1,term2,term3; + numtyp term4,term5,term6; + + term1 = ci*ck; + term2 = ck*dir - ci*dkr + dik; + term3 = 
ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; + term5 = qir*qkr; + numtyp scalek = (numtyp)1.0 - factor_mpole; + rr1 = bn[0] - scalek*rr1; + rr3 = bn[1] - scalek*rr3; + rr5 = bn[2] - scalek*rr5; + rr7 = bn[3] - scalek*rr7; + rr9 = bn[4] - scalek*rr9; + rr11 = bn[5] - scalek*rr11; + numtyp e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; + + // find standard multipole intermediates for force and torque + + numtyp de = term1*rr3 + term2*rr5 + term3*rr7 + term4*rr9 + term5*rr11; + term1 = -ck*rr3 + dkr*rr5 - qkr*rr7; + term2 = ci*rr3 + dir*rr5 + qir*rr7; + term3 = (numtyp)2.0 * rr5; + term4 = (numtyp)2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); + term5 = (numtyp)2.0 * (-ci*rr5-dir*rr7-qir*rr9); + term6 = (numtyp)4.0 * rr7; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + 
virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply Thole polarization damping to scale factors + + numtyp sc3 = (numtyp)1.0; + numtyp sc5 = (numtyp)1.0; + numtyp sc7 = (numtyp)1.0; + for (k = 0; k < 3; k++) { + rc3[k] = (numtyp)0.0; + rc5[k] = (numtyp)0.0; + rc7[k] = (numtyp)0.0; + } + + // apply Thole polarization damping to scale factors + + numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] + numtyp tmp = r*ucl_recip(damp); + damp = pgamma * (tmp*tmp*tmp); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + sc3 = (numtyp)1.0 - expdamp; + sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp; + sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp; + numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv; + numtyp temp5 = damp; + numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp; + rc3[0] = xr * temp3; + rc3[1] = yr * temp3; + rc3[2] = zr * temp3; + rc5[0] = rc3[0] * temp5; + rc5[1] = rc3[1] * temp5; + rc5[2] = rc3[2] * temp5; + rc7[0] = rc5[0] * temp7; + rc7[1] = rc5[1] * temp7; + rc7[2] = rc5[2] * temp7; + } + + psc3 = (numtyp)1.0 - sc3*factor_pscale; + psc5 = (numtyp)1.0 - sc5*factor_pscale; + psc7 = (numtyp)1.0 - sc7*factor_pscale; + dsc3 = (numtyp)1.0 - sc3*factor_dscale; + dsc5 = (numtyp)1.0 - sc5*factor_dscale; + dsc7 = (numtyp)1.0 - sc7*factor_dscale; + usc3 = (numtyp)1.0 - sc3*factor_uscale; + usc5 = (numtyp)1.0 - sc5*factor_uscale; + psr3 = bn[1] - psc3*rr3; + psr5 = 
bn[2] - psc5*rr5; + psr7 = bn[3] - psc7*rr7; + dsr3 = bn[1] - dsc3*rr3; + dsr5 = bn[2] - dsc5*rr5; + dsr7 = bn[3] - dsc7*rr7; + usr5 = bn[2] - usc5*rr5; + for (k = 0; k < 3; k++) { + prc3[k] = rc3[k] * factor_pscale; + prc5[k] = rc5[k] * factor_pscale; + prc7[k] = rc7[k] * factor_pscale; + drc3[k] = rc3[k] * factor_dscale; + drc5[k] = rc5[k] * factor_dscale; + drc7[k] = rc7[k] * factor_dscale; + urc3[k] = rc3[k] * factor_uscale; + urc5[k] = rc5[k] * factor_uscale; + } + } else { // damp == 0: ??? + } + + // get the induced dipole field used for dipole torques + + numtyp tix3 = psr3*ukx + dsr3*ukxp; + numtyp tiy3 = psr3*uky + dsr3*ukyp; + numtyp tiz3 = psr3*ukz + dsr3*ukzp; + numtyp tuir = -psr5*ukr - dsr5*ukrp; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp); + numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); + numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); + tuir = -psr7*ukr - dsr7*ukrp; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the dEd/dR terms used for direct polarization force + + term1 = bn[2] - dsc3*rr5; + term2 = bn[3] - dsc5*rr7; + term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr - dsr5*xr; + term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; + term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; + numtyp tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + 
(numtyp)2.0*qkx*term7 + qkr*term6; + + term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr - dsr5*yr; + term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1]; + term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr; + numtyp tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2]; + term4 = rr3*drc3[2] - term1*zr - dsr5*zr; + term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2]; + term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2]; + term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr; + numtyp tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*xr*yr - rr3*yr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0]; + term7 = rr5*drc5[0] - term2*xr; + numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6; + + term3 = term1*xr*zr - rr3*zr*drc3[0]; + term5 = term2*xr*zr - rr5*zr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0]; + numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - 
dkr*term5 + + (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*yr*zr - rr3*zr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1]; + term7 = rr5*drc5[1] - term2*yr; + numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp; + numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp; + numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp; + + numtyp frcx = depx; + numtyp frcy = depy; + numtyp frcz = depz; + + // get the dEp/dR terms used for direct polarization force + + // tixx and tkxx + term1 = bn[2] - psc3*rr5; + term2 = bn[3] - psc5*rr7; + term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr - psr5*xr; + term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0]; + term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr; + tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + // tiyy and tkyy + term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr - psr5*yr; + term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1]; + term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr; + tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyy + 
(qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + // tizz and tkzz + term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2]; + term4 = rr3*prc3[2] - term1*zr - psr5*zr; + term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2]; + term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2]; + term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr; + tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tixy and tkxy + term3 = term1*xr*yr - rr3*yr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0]; + term7 = rr5*prc5[0] - term2*xr; + tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6; + + // tixz and tkxz + term3 = term1*xr*zr - rr3*zr*prc3[0]; + term5 = term2*xr*zr - rr5*zr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0]; + tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tiyz and tkyz + term3 = term1*yr*zr - rr3*zr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1]; + term7 = rr5*prc5[1] - term2*yr; + tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 + + 
(numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + // get the dtau/dr terms used for mutual polarization force + // poltyp == MUTUAL && amoeba + + term1 = bn[2] - usc3*rr5; + term2 = bn[3] - usc5*rr7; + term3 = usr5 + term1; + term4 = rr3 * factor_uscale; + term5 = -xr*term3 + rc3[0]*term4; + term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0]; + tixx = uix*term5 + uir*term6; + tkxx = ukx*term5 + ukr*term6; + + term5 = -yr*term3 + rc3[1]*term4; + term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1]; + tiyy = uiy*term5 + uir*term6; + tkyy = uky*term5 + ukr*term6; + + term5 = -zr*term3 + rc3[2]*term4; + term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2]; + tizz = uiz*term5 + uir*term6; + tkzz = ukz*term5 + ukr*term6; + + term4 = -usr5 * yr; + term5 = -xr*term1 + rr3*urc3[0]; + term6 = xr*yr*term2 - rr5*yr*urc5[0]; + tixy = uix*term4 + uiy*term5 + uir*term6; + tkxy = ukx*term4 + uky*term5 + ukr*term6; + + term4 = -usr5 * zr; + term6 = xr*zr*term2 - rr5*zr*urc5[0]; + tixz = uix*term4 + uiz*term5 + uir*term6; + tkxz = ukx*term4 + ukz*term5 + ukr*term6; + + term5 = -yr*term1 + rr3*urc3[1]; + term6 = yr*zr*term2 - rr5*zr*urc5[1]; + tiyz = uiy*term4 + uiz*term5 + uir*term6; + tkyz = uky*term4 + ukz*term5 + ukr*term6; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp + + tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; 
+ + f.x += frcx; + f.y += frcy; + f.z += frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; k +class Amoeba : public BaseAmoeba { + public: + Amoeba(); + ~Amoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + 
int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; + /// Special amoeba values [0-4]: + /// sp_amoeba.x = special_hal + /// sp_amoeba.y = special_polar_pscale, + /// sp_amoeba.z = special_polar_piscale + /// sp_amoeba.w = special_mpole + UCL_D_Vec sp_amoeba; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + protected: + bool _allocated; + int multipole_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + +}; + +} + +#endif diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp new file mode 100644 index 0000000000..995dfbe95f --- /dev/null +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -0,0 +1,213 @@ +/*************************************************************************** + amoeba_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to amoeba acceleration routines. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_amoeba.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Amoeba AMOEBAMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale) { + AMOEBAMF.clear(); + gpu_mode=AMOEBAMF.device->gpu_mode(); + double gpu_split=AMOEBAMF.device->particle_split(); + int first_gpu=AMOEBAMF.device->first_device(); + int last_gpu=AMOEBAMF.device->last_device(); + int world_me=AMOEBAMF.device->world_me(); + int gpu_rank=AMOEBAMF.device->gpu_rank(); + int procs_per_gpu=AMOEBAMF.device->procs_per_gpu(); + + AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu); + + bool message=false; + if (AMOEBAMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + 
fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_hal, + host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); + + AMOEBAMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + AMOEBAMF.estimate_gpu_overhead(); + return init_ok; +} + +void amoeba_gpu_clear() { + AMOEBAMF.clear(); +} + +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double ** /*host_uind*/, + double ** /*host_uinp*/, double * /*host_pval*/, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); +} + + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, 
tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, off2, fieldp_ptr); +} + +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, off2, fieldp_ptr); +} + +void amoeba_gpu_update_fieldp(void **fieldp_ptr) { + AMOEBAMF.update_fieldp(fieldp_ptr); +} + +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); +} + +void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, + 
double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid, + nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi); +} + +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) { + AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec); +} + +void amoeba_setup_fft(const int numel, const int element_type) { + AMOEBAMF.setup_fft(numel, element_type); +} + +void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) { + AMOEBAMF.compute_fft1d(in, out, numel, mode); +} + +double amoeba_gpu_bytes() { + return AMOEBAMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 17cfa0dc5a..72cb59a912 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const { bytes+=sizeof(numtyp); if (_vel) bytes+=4*sizeof(numtyp); + if (_extra_fields>0) + bytes+=_extra_fields*sizeof(numtyp4); return bytes; } @@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } + if (_extra_fields>0) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } if (_gpu_nbor>0) { if (_bonds) { @@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) { template bool AtomT::add_fields(const bool charge, const bool rot, - const int gpu_nbor, const bool bonds, const bool vel) { + const int gpu_nbor, const bool bonds, 
const bool vel, + const int extra_fields) { bool success=true; // Ignore host/device transfers? int gpu_bytes=0; @@ -191,7 +199,17 @@ bool AtomT::add_fields(const bool charge, const bool rot, } } - if (bonds && !_bonds) { + if (extra_fields > 0 && _extra_fields==0) { + _extra_fields=extra_fields; + _other=true; + if (_host_view==false) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } + } + + if (bonds && _bonds==false) { _bonds=true; if (_bonds && _gpu_nbor>0) { success=success && (dev_tag.alloc(_max_atoms,*dev, @@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot, template bool AtomT::init(const int nall, const bool charge, const bool rot, - UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) { + UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { clear(); bool success=true; @@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, _q_avail=false; _quat_avail=false; _v_avail=false; + _extra_avail=false; _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; _rot=rot; _vel=vel; - _other=_charge || _rot || _vel; + _extra_fields=extra_fields; + _other=_charge || _rot || _vel || (extra_fields>0); dev=&devi; _time_transfer=0; @@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_q.init(*dev); time_quat.init(*dev); time_vel.init(*dev); + time_extra.init(*dev); + time_pos.zero(); time_q.zero(); time_quat.zero(); time_vel.zero(); + time_extra.zero(); + _time_cast=0.0; #ifdef GPU_CAST @@ -308,6 +333,8 @@ void AtomT::clear_resize() { quat.clear(); if (_vel) v.clear(); + if (_extra_fields>0) + extra.clear(); dev_cell_id.clear(); dev_particle_id.clear(); @@ -350,6 +377,7 @@ void AtomT::clear() { time_q.clear(); time_quat.clear(); time_vel.clear(); + time_extra.clear(); clear_resize(); #ifdef 
GPU_CAST @@ -370,12 +398,19 @@ double AtomT::host_memory_usage() const { atom_bytes+=4; if (_vel) atom_bytes+=4; + if (_extra_fields>0) + atom_bytes+=_extra_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } +#ifdef USE_CUDPP +#define USE_CUDPP_ARG(arg) arg +#else +#define USE_CUDPP_ARG(arg) +#endif // Sort arrays for neighbor list calculation template -void AtomT::sort_neighbor(const int num_atoms) { +void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) { #ifdef USE_CUDPP CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), (int *)dev_particle_id.begin(), diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 77c1faa784..771c2a3571 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -76,7 +76,7 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ @@ -96,7 +96,7 @@ class Atom { * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, - const bool bonds, const bool vel=false); + const bool bonds, const bool vel=false, const int extra_fields=0); /// Returns true if GPU is using charges bool charge() { return _charge; } @@ -107,6 +107,9 @@ class Atom { /// Returns true if GPU is using velocities bool velocity() { return _vel; } + /// Returns true if GPU is using extra fields + bool using_extra() { return (_extra_fields>0); } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -128,6 +131,8 @@ class Atom { time_quat.add_to_total(); if (_vel) time_vel.add_to_total(); + if (_extra_fields>0) + 
time_extra.add_to_total(); } /// Add copy times to timers @@ -139,6 +144,8 @@ class Atom { time_quat.zero(); if (_vel) time_vel.zero(); + if (_extra_fields>0) + time_extra.zero(); } /// Return the total time for host/device data transfer @@ -158,6 +165,10 @@ class Atom { total+=time_vel.total_seconds(); time_vel.zero_total(); } + if (_extra_fields>0) { + total+=time_extra.total_seconds(); + time_extra.zero_total(); + } return total+_time_transfer/1000.0; } @@ -281,7 +292,11 @@ class Atom { /// Signal that we need to transfer atom data for next timestep inline void data_unavail() - { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } + { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; } + + /// Signal that we need to transfer atom extra data for next kernel call + inline void extra_data_unavail() + { _extra_avail=false; } typedef struct { double x,y,z; } vec3d; typedef struct { numtyp x,y,z,w; } vec4d_t; @@ -312,7 +327,7 @@ class Atom { /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - inline void add_x_data(double **host_ptr, int *host_type) { + inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) { time_pos.start(); if (_x_avail==false) { #ifdef GPU_CAST @@ -426,7 +441,7 @@ class Atom { /// Copy velocities and tags to device asynchronously /** Copies nall() elements **/ - inline void add_v_data(double **host_ptr, tagint *host_tag) { + inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) { time_vel.start(); if (_v_avail==false) { #ifdef GPU_CAST @@ -450,6 +465,33 @@ class Atom { add_v_data(host_ptr,host_tag); } + // Cast extras to write buffer + template + inline void cast_extra_data(cpytyp *host_ptr) { + if (_extra_avail==false) { + double t=MPI_Wtime(); + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + 
#endif + for (int i=0; i<_nall*_extra_fields; i++) + extra[i]=host_ptr[i]; + _time_cast+=MPI_Wtime()-t; + } + } + + // Copy extras to device + /** Copies nall()*_extra elements **/ + inline void add_extra_data() { + time_extra.start(); + if (_extra_avail==false) { + extra.update_device(_nall*_extra_fields,true); + _extra_avail=true; + } + time_extra.stop(); + } + /// Add in casting time from additional data (seconds) inline void add_cast_time(double t) { _time_cast+=t; } @@ -473,6 +515,8 @@ class Atom { UCL_Vector quat; /// Velocities UCL_Vector v; + /// Extras + UCL_Vector extra; #ifdef GPU_CAST UCL_Vector x_cast; @@ -493,7 +537,7 @@ class Atom { UCL_H_Vec host_particle_id; /// Device timers - UCL_Timer time_pos, time_q, time_quat, time_vel; + UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra; /// Geryon device UCL_Device *dev; @@ -508,11 +552,12 @@ class Atom { bool _compiled; // True if data has been copied to device already - bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; + bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized; bool alloc(const int nall); bool _allocated, _rot, _charge, _bonds, _vel, _other; + int _extra_fields; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp new file mode 100644 index 0000000000..09d7386461 --- /dev/null +++ b/lib/gpu/lal_base_amoeba.cpp @@ -0,0 +1,962 @@ +/*************************************************************************** + base_amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include "lal_base_amoeba.h" + +namespace LAMMPS_AL { +#define BaseAmoebaT BaseAmoeba + +extern Device global_device; + +template +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) { + device=&global_device; + ans=new Answer(); + nbor=new Neighbor(); + pair_program=nullptr; + ucl_device=nullptr; +} + +template +BaseAmoebaT::~BaseAmoeba() { + delete ans; + delete nbor; + k_multipole.clear(); + k_udirect2b.clear(); + k_umutual2b.clear(); + k_fphi_uind.clear(); + k_fphi_mpole.clear(); + k_polar.clear(); + k_special15.clear(); + k_short_nbor.clear(); + + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created) cufftDestroy(plan); + #endif + + if (pair_program) delete pair_program; +} + +template +int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseAmoebaT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const int maxspecial15, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name_multipole, + const char *k_name_udirect2b, + const char *k_name_umutual2b, + const char *k_name_polar, + const char *k_name_fphi_uind, + const char *k_name_fphi_mpole, + const char *k_name_short_nbor, + const char* k_name_special15) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int 
host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_charge(); + + bool charge = true; + bool rot = false; + bool vel = false; + _extra_fields = 24; // round up to accomodate quadruples of numtyp values + // rpole 13; uind 3; uinp 3; amtype, amgroup; pval + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); + if (success!=0) + return success; + + if (ucl_device!=device->gpu) _compiled=false; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + _block_bio_size=device->block_bio_pair(); + compile_kernels(*ucl_device,pair_program,k_name_multipole, + k_name_udirect2b, k_name_umutual2b,k_name_polar, + k_name_fphi_uind, k_name_fphi_mpole, + k_name_short_nbor, k_name_special15); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else { + _nbor_data=&(nbor->dev_nbor); + } + + bool alloc_packed=false; + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, + _gpu_host,max_nbors,cell_size,alloc_packed, + _threads_per_atom); + if (success!=0) + return success; + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + _maxspecial=maxspecial; + _maxspecial15=maxspecial15; + + // allocate per-atom array tep + + int ef_nall=nlocal; //nall; + if (ef_nall==0) + ef_nall=2000; + + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _max_fieldp_size = _max_tep_size; + 
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _max_thetai_size = 0; + + _nmax = nall; + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + fft_plan_created = false; + #endif + + #ifdef ASYNC_DEVICE_COPY + _end_command_queue=ucl_device->num_queues(); + ucl_device->push_command_queue(); + #endif + + return success; +} + +template +void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); +} + +template +void BaseAmoebaT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + time_pair.clear(); + hd_balancer.clear(); + + dev_short_nbor.clear(); + nbor->clear(); + ans->clear(); + + _tep.clear(); + _fieldp.clear(); + _thetai1.clear(); + _thetai2.clear(); + _thetai3.clear(); + _igrid.clear(); + _fdip_phi1.clear(); + _fdip_phi2.clear(); + _fdip_sum_phi.clear(); + _cgrid_brick.clear(); + + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + 
return nullptr; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return 0; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + + // add one-five neighbors + + if (_maxspecial15>0) { + UCL_H_Vec view_nspecial15; + UCL_H_Vec view_special15; + view_nspecial15.view(nspecial15,nall,*ucl_device); + view_special15.view(special15[0],nall*_maxspecial15,*ucl_device); + ucl_copy(dev_nspecial15,view_nspecial15,nall,false); + ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false); + nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall); + + add_onefive_neighbors(); + } + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + return mn; +} + +// --------------------------------------------------------------------------- +// Prepare for multiple kernel calls in a time step: +// - reallocate per-atom arrays, if needed +// - transfer extra data from host to device +// - build the full neighbor lists for use by different kernels +// --------------------------------------------------------------------------- + +template +int** BaseAmoebaT::precompute(const 
int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&jnum, const double cpu_time, + bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) { + acc_timers(); + if (eatom) _eflag=2; + else if (eflag_in) _eflag=1; + else _eflag=0; + if (vatom) _vflag=2; + else if (vflag_in) _vflag=1; + else _vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (_eflag) _eflag=2; + if (_vflag) _vflag=2; + #endif + + set_kernel(_eflag,_vflag); + + // ------------------- Resize 1-5 neighbor arrays ------------------------ + + if (nall>_nmax) { + _nmax = nall; + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + 
atom->add_x_data(host_x,host_type); + } + atom->add_q_data(); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); + + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + // re-allocate dev_short_nbor if necessary + if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(inum_full)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Compute multipole real-space part +// precompute() should be already invoked before mem (re)allocation +// this is the first part in a time step done on the GPU for AMOEBA for now +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double */*host_pval*/, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool /*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double aewald, const double felec, + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { + // ------------------- Resize _tep array ------------------------ + + if (inum_full>_max_tep_size) { + _max_tep_size=static_cast(static_cast(inum_full)*1.10); + _tep.resize(_max_tep_size*4); + } + *tep_ptr=_tep.host.begin(); + + _off2_mpole = off2_mpole; + _felec = felec; + _aewald = aewald; + multipole_real(_eflag,_vflag); + + // leave the 
answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the permanent field +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr) { + // all the necessary data arrays are already copied from host to device + + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); + + *fieldp_ptr=_fieldp.host.begin(); + + // specify the correct cutoff and alpha values + _off2_polar = off2_polar; + _aewald = aewald; + udirect2b(_eflag,_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + _fieldp.update_host(_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the induced field +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, + void** /*fieldp_ptr*/) { + // only copy the necessary data arrays that are updated over the iterations + // use nullptr for the other arrays that are already copied from host to device + 
cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); + atom->add_extra_data(); + + // set the correct cutoff and alpha + _off2_polar = off2_polar; + _aewald = aewald; + // launch the kernel + umutual2b(_eflag,_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + // NOTE: move this step to update_fieldp() to delay device-host transfer + // after umutual1 and self are done on the GPU + // *fieldp_ptr=_fieldp.host.begin(); + // _fieldp.update_host(_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Prepare for umutual1() after bspline_fill() is done on host +// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed +// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 +// host_igrid is allocated with nmax by 4 +// - transfer extra data from host to device +// NOTE: can be re-used for fphi_mpole() but with a different bsorder value +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** host_igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + // update bsorder with that of the kspace solver + _bsorder = bsorder; + + // allocate or resize per-atom arrays + // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax + // will be consolidated once all terms are ready + + if (_max_thetai_size == 0) { + _max_thetai_size = static_cast(static_cast(inum_full)*1.10); + _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + 
_thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); + } else { + if ((int)_thetai1.cols()<_max_thetai_size*bsorder) { + _max_thetai_size=static_cast(static_cast(inum_full)*1.10); + _thetai1.resize(_max_thetai_size*bsorder); + _thetai2.resize(_max_thetai_size*bsorder); + _thetai3.resize(_max_thetai_size*bsorder); + _igrid.resize(_max_thetai_size*4); + + _fdip_phi1.resize(_max_thetai_size*10); + _fdip_phi2.resize(_max_thetai_size*10); + _fdip_sum_phi.resize(_max_thetai_size*20); + } + } + + #ifdef ASYNC_DEVICE_COPY + _thetai1.cq(ucl_device->cq(_end_command_queue)); + _thetai2.cq(ucl_device->cq(_end_command_queue)); + _thetai3.cq(ucl_device->cq(_end_command_queue)); + #endif + + // pack host data to device + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai1[i][j][0]; + v.y = host_thetai1[i][j][1]; + v.z = host_thetai1[i][j][2]; + v.w = host_thetai1[i][j][3]; + _thetai1[idx] = v; + } + _thetai1.update_device(true); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai2[i][j][0]; + v.y = host_thetai2[i][j][1]; + v.z = host_thetai2[i][j][2]; + v.w = host_thetai2[i][j][3]; + _thetai2[idx] = v; + } + _thetai2.update_device(true); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai3[i][j][0]; + v.y = host_thetai3[i][j][1]; + v.z = host_thetai3[i][j][2]; + v.w = host_thetai3[i][j][3]; + _thetai3[idx] = v; + } + _thetai3.update_device(true); + + for (int i = 0; i < 
inum_full; i++) { + int idx = i*4; + _igrid[idx+0] = host_igrid[i][0]; + _igrid[idx+1] = host_igrid[i][1]; + _igrid[idx+2] = host_igrid[i][2]; + } + _igrid.update_device(true); + + // _cgrid_brick holds the grid-based potential + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx * _ngridy * _ngridz; + + int numel = _num_grid_points; + if (_cgrid_brick.cols() == 0) { + int nsize=(int)(((double)numel)*1.1); + _cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); + } else if (numel > (int)_cgrid_brick.cols()) { + _cgrid_brick.resize(numel); + } +} + +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// NOTE: host_grid_brick is from ic_kspace post_convolution() +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, + void **host_fdip_phi1, + void **host_fdip_phi2, + void **host_fdip_sum_phi) +{ + int n = 0; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix][0]; + v.y = host_grid_brick[iz][iy][ix][1]; + _cgrid_brick[n] = v; + n++; + } + _cgrid_brick.update_device(_num_grid_points, false); + + #ifdef ASYNC_DEVICE_COPY + ucl_device->sync(); + #endif + + // launch the kernel with its execution configuration (see below) + fphi_uind(); + + // copy data from device to host + _fdip_phi1.update_host(_max_thetai_size*10, false); + _fdip_phi2.update_host(_max_thetai_size*10, false); + 
_fdip_sum_phi.update_host(_max_thetai_size*20, false); + + // return the pointers to the host-side arrays + *host_fdip_phi1 = _fdip_phi1.host.begin(); + *host_fdip_phi2 = _fdip_phi2.host.begin(); + *host_fdip_sum_phi = _fdip_sum_phi.host.begin(); +} + +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_uind() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + // Compute the block size and grid size to keep all cores busy + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + + time_pair.start(); + int ngridxy = _ngridx * _ngridy; + k_fphi_uind.set_size(GX,BX); + k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); + time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// fphi_mpole = multipole potential from grid (limited to polar_kspace for now) +// fphi_mpole extracts the permanent multipole potential from +// the particle mesh Ewald grid +// NOTE: host_grid_brick is from p_kspace post_convolution() +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) +{ + int n = 0; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix]; + v.y = (numtyp)0; + _cgrid_brick[n] = v; + n++; + } + _cgrid_brick.update_device(_num_grid_points, false); + + _felec = felec; + fphi_mpole(); + + _fdip_sum_phi.update_host(_max_thetai_size*20, false); + + *host_fphi = 
_fdip_sum_phi.host.begin(); +} + +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_mpole() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + // Compute the block size and grid size to keep all cores busy + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + + time_pair.start(); + int ngridxy = _ngridx * _ngridy; + k_fphi_mpole.set_size(GX,BX); + k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_sum_phi, &_bsorder, &ainum, &_felec, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); + time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { + + // cast necessary data arrays from host to device + + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); + + *tep_ptr=_tep.host.begin(); + + _off2_polar = off2_polar; + _felec = felec; + _aewald = aewald; + const int red_blocks=polar_real(_eflag,_vflag); + + // only copy answers (forces, energies and virial) back from the device + // in the last kernel (which is polar_real here) + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + + // copy tep from device to host + _tep.update_host(_max_tep_size*4,false); +} 
+ +// --------------------------------------------------------------------------- +// Return the memory bytes allocated on the host and device +// --------------------------------------------------------------------------- + +template +double BaseAmoebaT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseAmoeba); +} + +// --------------------------------------------------------------------------- +// Setup the FFT plan: only placeholder for now +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/) +{ + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) +} + +// --------------------------------------------------------------------------- +// Compute FFT on the device: only placeholder for now +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/, + const int /*numel*/, const int /*mode*/) +{ + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created == false) { + int m = numel/2; + cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); + fft_plan_created = true; + } + + // n = number of double complex + int n = numel/2; + + // copy the host array to the device (data) + UCL_Vector data; + data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE); + int m = 0; + double* d_in = (double*)in; + for (int i = 0; i < n; i++) { + data[i].x = d_in[m]; + data[i].y = d_in[m+1]; + m += 2; + } + data.update_device(false); + + // perform the in-place forward FFT + + cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device, + (cufftDoubleComplex*)&data.device, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result); + ucl_device->sync(); + 
data.update_host(false); + + // copy back the data to the host array + + m = 0; + double* d_out = (double*)out; + for (int i = 0; i < n; i++) { + d_out[m] = data[i].x; + d_out[m+1] = data[i].y; + m += 2; + } + + data.clear(); + #endif +} + +// --------------------------------------------------------------------------- +// Copy the extra data from host to device +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp, double* pval) { + // signal that we need to transfer extra data from the host + + atom->extra_data_unavail(); + + int _nall=atom->nall(); + numtyp4 *pextra=reinterpret_cast(&(atom->extra[0])); + + int n = 0; + int nstride = 1; //4; + if (rpole) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = rpole[i][0]; + pextra[idx].y = rpole[i][1]; + pextra[idx].z = rpole[i][2]; + pextra[idx].w = rpole[i][3]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = rpole[i][4]; + pextra[idx].y = rpole[i][5]; + pextra[idx].z = rpole[i][6]; + pextra[idx].w = rpole[i][8]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = rpole[i][9]; + pextra[idx].y = rpole[i][12]; + pextra[idx].z = (numtyp)amtype[i]; + pextra[idx].w = (numtyp)amgroup[i]; + } + } else { + n += 2*nstride*_nall; + } + + n += nstride*_nall; + if (uind) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = uind[i][0]; + pextra[idx].y = uind[i][1]; + pextra[idx].z = uind[i][2]; + pextra[idx].w = 0; + } + } + + n += nstride*_nall; + if (uinp) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = uinp[i][0]; + pextra[idx].y = uinp[i][1]; + pextra[idx].z = uinp[i][2]; + pextra[idx].w = 0; + } + } + + n += nstride*_nall; + if (pval) { + for (int i = 0; i < _nall; i++) { + int 
idx = n+i*nstride; + pextra[idx].x = pval[i]; + pextra[idx].y = 0; + pextra[idx].z = 0; + pextra[idx].w = 0; + } + } +} + +// --------------------------------------------------------------------------- +// Compile (load) the kernel strings and set the kernels +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname_multipole, + const char *kname_udirect2b, + const char *kname_umutual2b, + const char *kname_polar, + const char *kname_fphi_uind, + const char *kname_fphi_mpole, + const char *kname_short_nbor, + const char* kname_special15) { + if (_compiled) + return; + + if (pair_program) delete pair_program; + pair_program=new UCL_Program(dev); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen); + + k_multipole.set_function(*pair_program, kname_multipole); + k_udirect2b.set_function(*pair_program, kname_udirect2b); + k_umutual2b.set_function(*pair_program, kname_umutual2b); + k_polar.set_function(*pair_program, kname_polar); + k_fphi_uind.set_function(*pair_program, kname_fphi_uind); + k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole); + k_short_nbor.set_function(*pair_program, kname_short_nbor); + k_special15.set_function(*pair_program, kname_special15); + pos_tex.get_texture(*pair_program, "pos_tex"); + q_tex.get_texture(*pair_program, "q_tex"); + + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.has_subgroup_support()) { + int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size); + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + +} + +// --------------------------------------------------------------------------- +// Specify 1-5 neighbors from the current neighbor list +// 
--------------------------------------------------------------------------- + +template +int BaseAmoebaT::add_onefive_neighbors() { + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(ans->inum())/ + (BX/_threads_per_atom))); + + int _nall=atom->nall(); + int ainum=ans->inum(); + int nbor_pitch=nbor->nbor_pitch(); + + k_special15.set_size(GX,BX); + k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), + &atom->dev_tag, &dev_nspecial15, &dev_special15, + &ainum, &_nall, &nbor_pitch, + &_threads_per_atom); + + return GX; +} + +template class BaseAmoeba; +} diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h new file mode 100644 index 0000000000..0eaaafeb1e --- /dev/null +++ b/lib/gpu/lal_base_amoeba.h @@ -0,0 +1,325 @@ +/*************************************************************************** + base_amoeba.h + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#ifndef LAL_BASE_AMOEBA_H +#define LAL_BASE_AMOEBA_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#if defined(USE_OPENCL) +#include "geryon/ocl_texture.h" +#elif defined(USE_CUDART) +#include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +//#define ASYNC_DEVICE_COPY + +#if !defined(USE_OPENCL) && !defined(USE_HIP) +// temporary workaround for int2 also defined in cufft +#ifdef int2 +#undef int2 +#endif +#include "cufft.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseAmoeba { + public: + BaseAmoeba(); + virtual ~BaseAmoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *screen, const void *pair_program, + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); + + /// Estimate the 
overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(const int add_kernels=0); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(screen); + time_pair.add_to_total(); + atom->acc_timers(); + 
ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + int build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint **special15, + bool &success); + + /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed + virtual int** precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); + + /// Compute multipole real-space with device neighboring + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **numj, const double cpu_time, + bool &success, const double aewald, const double felec, + const double off2_mpole, double *charge, double *boxlo, + double *prd, void **tep_ptr); + + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + 
virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, void **fieldp_ptr); + + /// Compute the real space part of the induced field (umutual2b) with device neighboring + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, void **fieldp_ptr); + + /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar + virtual void precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + /// Interpolate the induced potential from the grid + virtual void compute_fphi_uind(double ****host_grid_brick, + void **host_fdip_phi1, void **host_fdip_phi2, + void **host_fdip_sum_phi); + + /// Interpolate the multipolar potential from the grid + virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, + const double felec); + + /// Compute polar real-space with device neighboring + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2_polar, + void **tep_ptr); + + // copy field and fieldp from device to host after umutual2b + virtual void update_fieldp(void **fieldp_ptr) { + *fieldp_ptr=_fieldp.host.begin(); + // _fieldp store both arrays, one after another + _fieldp.update_host(_max_fieldp_size*8,false); + } + + /// setup a plan for FFT, where size is the number of elements + + void setup_fft(const int size, const int 
element_type=0); + + /// compute forward/backward FFT on the device + + void compute_fft1d(void* in, void* out, const int numel, const int mode); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom *atom; + + UCL_Vector polar1, polar2, polar3, polar4, polar5; + + /// cast host arrays into a single array for atom->extra + void cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp, double* pval=nullptr); + + /// Per-atom arrays + UCL_Vector _tep, _fieldp; + int _nmax, _max_tep_size, _max_fieldp_size; + + int _bsorder; + UCL_Vector _thetai1, _thetai2, _thetai3; + UCL_Vector _igrid; + UCL_Vector _cgrid_brick; + UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; + int _max_thetai_size; + int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; + int _ngridx, _ngridy, _ngridz, _num_grid_points; + + int _end_command_queue; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + /// Device storage for 1-5 special neighbor counts + UCL_D_Vec dev_nspecial15; + /// Device storage for special neighbors + UCL_D_Vec dev_special15, dev_special15_t; + + int add_onefive_neighbors(); + + UCL_D_Vec dev_short_nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_fphi_uind, k_fphi_mpole; + UCL_Kernel k_special15, k_short_nbor; + inline int block_size() { return 
_block_size; } + inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {} + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _compiled; + int _block_size, _block_bio_size, _threads_per_atom; + int _extra_fields; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors; + double _gpu_overhead, _driver_overhead; + bool short_nbor_polar_avail; + UCL_D_Vec *_nbor_data; + + numtyp _aewald,_felec; + numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; + + int _eflag, _vflag; + + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); + + virtual int multipole_real(const int eflag, const int vflag) = 0; + virtual int udirect2b(const int eflag, const int vflag) = 0; + virtual int umutual2b(const int eflag, const int vflag) = 0; + virtual int fphi_uind(); + virtual int fphi_mpole(); + virtual int polar_real(const int eflag, const int vflag) = 0; + + + #if !defined(USE_OPENCL) && !defined(USE_HIP) + cufftHandle plan; + #endif + bool fft_plan_created; +}; + +} + +#endif diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index bb0e815b3f..0cfc084fa4 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index 4a59f70d83..3cd6c6030a 100644 --- 
a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,false,nlocal,nall,maxspecial); + bool charge = true; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 66e03de651..6ef1c40ca7 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,true,nlocal,nall,maxspecial); + bool charge = true; + bool rot = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index 44b86abeeb..e103699d40 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); + bool charge = false; + bool rot = false; + bool vel = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel); if (success!=0) return success; @@ -193,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall, const double cpu_time, bool &success, tagint *tag, double **host_v, const double dtinvsqrt, const int seed, const int timestep, - const int nlocal, double *boxlo, double *prd) { + const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -258,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full, const double cpu_time, bool &success, double **host_v, const double dtinvsqrt, const 
int seed, const int timestep, - double *boxlo, double *prd) { + double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index 3457955b3e..bfadfebf66 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall, else _threads_per_atom=device->threads_per_three(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index 8008b1fbb3..0d01d70fb1 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -44,19 +44,15 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const { } template -int CHARMMLongT::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double host_cut_ljsq, const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, - double **sigma, const bool mix_arithmetic) { +int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double ** /*host_offset*/, double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, FILE *_screen, + double host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, const double g_ewald, + 
const double cut_lj_innersq, const double denom_lj, double **epsilon, + double **sigma, const bool mix_arithmetic) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,charmm_long,"k_charmm_long"); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 0d9578b491..dd3ce15827 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -52,7 +52,7 @@ DeviceT::~Device() { } template -int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, +int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu, const int first_gpu_id, const int gpu_mode, const double p_split, const int t_per_atom, const double user_cell_size, char *ocl_args, @@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args) } _ocl_compile_string="-cl-mad-enable "; + #ifdef CL_VERSION_2_0 + _ocl_compile_string+="-cl-std=CL2.0 "; + #endif if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math "; _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+ std::string(OCL_PRECISION_COMPILE); @@ -438,7 +441,7 @@ template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel) { + const bool vel, const int extra_fields) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision()) @@ -467,7 +470,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields)) return -3; _data_in_estimate++; @@ -477,6 +480,9 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (vel) _data_in_estimate++; + if (extra_fields>0) + _data_in_estimate++; + } else { if (!atom.charge() && charge) _data_in_estimate++; @@ -484,7 +490,9 @@ 
int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (!atom.velocity() && vel) _data_in_estimate++; - if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) + if (atom.using_extra() && extra_fields>0) + _data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields)) return -3; } @@ -520,7 +528,7 @@ int DeviceT::init(Answer &ans, const int nlocal, template int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, - const int host_nlocal, const int nall, + const int host_nlocal, const int /*nall*/, const int maxspecial, const int gpu_host, const int max_nbors, const double cutoff, const bool pre_cut, const int threads_per_atom, diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index f5136d9fa0..3b27223007 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -61,6 +61,7 @@ class Device { * \param nall Total number of local+ghost particles * \param maxspecial Maximum mumber of special bonded atoms per atom * \param vel True if velocities need to be stored + * \param extra_fields Nonzero if extra fields need to be stored * * Returns: * - 0 if successful @@ -70,7 +71,7 @@ class Device { * - -5 Double precision is not supported on card **/ int init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Initialize the device for Atom storage only /** \param nlocal Total number of local particles to allocate memory for diff --git a/lib/gpu/lal_dpd_tstat_ext.cpp b/lib/gpu/lal_dpd_tstat_ext.cpp index 2b63bf62e7..78a1bf2d9d 100644 --- a/lib/gpu/lal_dpd_tstat_ext.cpp +++ b/lib/gpu/lal_dpd_tstat_ext.cpp @@ -28,10 +28,10 @@ static DPD DPDTMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double 
**host_a0, - double **host_gamma, double **host_sigma, double **host_cut, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen) { + double **host_gamma, double **host_sigma, double **host_cut, + double *special_lj, const int inum, + const int nall, const int /*max_nbors*/, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { DPDTMF.clear(); gpu_mode=DPDTMF.device->gpu_mode(); double gpu_split=DPDTMF.device->particle_split(); diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 2c0d63f7bf..b7bc7b958a 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, + const bool /*eatom*/, const bool /*vatom*/, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); @@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag_in, - const bool vflag_in, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, + const bool vflag_in, const bool /*eatom*/, + const bool /*vatom*/, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp new file mode 100644 index 0000000000..24ffae8de2 --- /dev/null +++ b/lib/gpu/lal_hippo.cpp @@ -0,0 +1,641 @@ +/*************************************************************************** + hippo.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the hippo pair style. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "hippo_cl.h" +#elif defined(USE_CUDART) +const char *hippo=0; +#else +#include "hippo_cubin.h" +#endif + +#include "lal_hippo.h" +#include +namespace LAMMPS_AL { +#define HippoT Hippo + +extern Device device; + +template +HippoT::Hippo() : BaseAmoeba(), + _allocated(false) { +} + +template +HippoT::~Hippo() { + clear(); + k_repulsion.clear(); + k_dispersion.clear(); + +} + +template +int HippoT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,hippo, + "k_hippo_multipole", "k_hippo_udirect2b", + "k_hippo_umutual2b", "k_hippo_polar", + "k_hippo_fphi_uind", "k_hippo_fphi_mpole", 
+ "k_hippo_short_nbor", "k_hippo_special15"); + if (success!=0) + return success; + + // specific to HIPPO + k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion"); + k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = host_dirdamp[i]; + host_write[i].w = host_amtype2class[i]; + } + + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_sizpr[i]; + host_write[i].y = host_dmppr[i]; + host_write[i].z = host_elepr[i]; + host_write[i].w = (numtyp)0; + } + + coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_rep,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = host_pcore[i]; + host_write2[i].w = host_palpha[i]; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_polar_wscale[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=host_special_mpole[i]; 
+ } + ucl_copy(sp_polar,dview,5,false); + + sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_repel[i]; + dview[i].y=host_special_disp[i]; + dview[i].z=(numtyp)0; + dview[i].w=(numtyp)0; + } + ucl_copy(sp_nonpolar,dview,5,false); + + _polar_dscale = polar_dscale; + _polar_uscale = polar_uscale; + + _allocated=true; + this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + + coeff_amclass.row_bytes() + sp_polar.row_bytes() + + sp_nonpolar.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); + return 0; +} + +template +void HippoT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff_amtype.clear(); + coeff_rep.clear(); + coeff_amclass.clear(); + sp_polar.clear(); + sp_nonpolar.clear(); + + this->clear_atomic(); +} + +template +double HippoT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Hippo); +} + +// --------------------------------------------------------------------------- +// Compute the repulsion term, returning tep +// --------------------------------------------------------------------------- +template +void HippoT::compute_repulsion(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double /*aewald*/, const double off2_repulse, + double * /*host_q*/, double * /*boxlo*/, double * /*prd*/, + double cut2, 
double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_repulse = off2_repulse; + _cut2 = cut2; + _c0 = c0; + _c1 = c1; + _c2 = c2; + _c3 = c3; + _c4 = c4; + _c5 = c5; + repulsion(this->_eflag,this->_vflag); + + // copy tep from device to host + this->_tep.update_host(this->_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Launch the repulsion kernel +// --------------------------------------------------------------------------- +template +int HippoT::repulsion(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_disp, + // at this point repuslion is the first kernel in a time step for HIPPO + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_repulse, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_repulsion.set_size(GX,BX); + k_repulsion.run(&this->atom->x, &this->atom->extra, + &coeff_rep, 
&sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_repulse, &_cut2, + &_c0, &_c1, &_c2, &_c3, &_c4, &_c5); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Compute dispersion real-space +// --------------------------------------------------------------------------- +template +void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp) { + + // cast necessary data arrays from host to device + + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr); + this->atom->add_extra_data(); + + this->_off2_disp = off2_disp; + this->_aewald = aewald; + dispersion_real(this->_eflag,this->_vflag); + + // only copy them back if this is the last kernel + // otherwise, commenting out these two lines to leave the answers + // (forces, energies and virial) on the device until the last kernel + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); +} + +// --------------------------------------------------------------------------- +// Launch the dispersion real-space kernel +// --------------------------------------------------------------------------- +template +int HippoT::dispersion_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff 
off2_disp, + // at this point dispersion is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_disp, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_dispersion.set_size(GX,BX); + k_dispersion.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_disp); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Compute the multipole real-space term, returning tep +// --------------------------------------------------------------------------- +template +void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double* host_pval, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool /*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double aewald, const double felec, + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { + + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval); + this->atom->add_extra_data(); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + 
this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_mpole = off2_mpole; + this->_felec = felec; + this->_aewald = aewald; + multipole_real(this->_eflag,this->_vflag); + + // copy tep from device to host + this->_tep.update_host(this->_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Launch the multipole real-space kernel +// --------------------------------------------------------------------------- +template +int HippoT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Compute the direct real space part of the permanent field +// returning field and fieldp +// 
--------------------------------------------------------------------------- +template +void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double* host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr) { + + // all the necessary data arrays are already copied from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval); + this->atom->add_extra_data(); + + *fieldp_ptr=this->_fieldp.host.begin(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + udirect2b(this->_eflag,this->_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + this->_fieldp.update_host(this->_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Launch the real-space permanent field kernel +// --------------------------------------------------------------------------- +template +int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_udirect2b.set_size(GX,BX); + 
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Compute the direct real space term of the induced field +// returning field and fieldp +// --------------------------------------------------------------------------- +template +void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) { + + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + umutual2b(this->_eflag,this->_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + // NOTE: move this step to update_fieldp() to delay device-host transfer + // *fieldp_ptr=this->_fieldp.host.begin(); + // this->_fieldp.update_host(this->_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Launch the real-space induced field kernel +// --------------------------------------------------------------------------- +template +int HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int 
GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); + + *tep_ptr=this->_tep.host.begin(); + + this->_off2_polar = off2_polar; + this->_felec = felec; + this->_aewald = aewald; + const int red_blocks=polar_real(this->_eflag,this->_vflag); + + // only copy answers (forces, energies and virial) back from the device + // in the last kernel in a timestep (which is polar_real here) + 
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); + + // copy tep from device to host + this->_tep.update_host(this->_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Launch the polar real-space kernel +// --------------------------------------------------------------------------- +template +int HippoT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + */ + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + 
this->short_nbor_polar_avail = false; + + return GX; +} + +template class Hippo; +} diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu new file mode 100644 index 0000000000..99e20db223 --- /dev/null +++ b/lib/gpu/lal_hippo.cu @@ -0,0 +1,2519 @@ +// ************************************************************************** +// hippo.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the hippo pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_hippo_extra.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define store_answers_hippo_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + 
tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; 
s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r 
= 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii cut2) { + numtyp r3 = r2 * r; + numtyp r4 = r2 * r2; + numtyp r5 = r2 * r3; + numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0; + numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp)4.0*c4*r3 + + (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1; + dtaper *= e * rr1; + e *= taper; + frcx = frcx*taper - dtaper*xr; + frcy = frcy*taper - dtaper*yr; + frcz = frcz*taper - dtaper*zr; + ttmix *= taper; + ttmiy *= taper; + ttmiz *= taper; + } + + energy += e; + + // increment force-based gradient and torque on atom I + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + // increment the internal virial tensor components + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + int m; + for (m = 1; m < 6; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; 
m < 6; m++) bn[m] *= felec; + + numtyp term1,term2,term3; + numtyp term4,term5,term6; + + term1 = corei*corek; + numtyp term1i = corek*vali; + numtyp term2i = corek*dir; + numtyp term3i = corek*qir; + numtyp term1k = corei*valk; + numtyp term2k = -corei*dkr; + numtyp term3k = corei*qkr; + numtyp term1ik = vali*valk; + numtyp term2ik = valk*dir - vali*dkr + dik; + numtyp term3ik = vali*qkr + valk*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); + numtyp term4ik = dir*qkr - dkr*qir - 4.0*qik; + numtyp term5ik = qir*qkr; + numtyp dmpi[9],dmpj[9]; + numtyp dmpij[11]; + damppole(r,11,alphai,alphak,dmpi,dmpj,dmpij); + numtyp scalek = factor_mpole; + numtyp rr1i = bn[0] - ((numtyp)1.0-scalek*dmpi[0])*rr1; + numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3; + numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5; + numtyp rr7i = bn[3] - ((numtyp)1.0-scalek*dmpi[6])*rr7; + numtyp rr1k = bn[0] - ((numtyp)1.0-scalek*dmpj[0])*rr1; + numtyp rr3k = bn[1] - ((numtyp)1.0-scalek*dmpj[2])*rr3; + numtyp rr5k = bn[2] - ((numtyp)1.0-scalek*dmpj[4])*rr5; + numtyp rr7k = bn[3] - ((numtyp)1.0-scalek*dmpj[6])*rr7; + numtyp rr1ik = bn[0] - ((numtyp)1.0-scalek*dmpij[0])*rr1; + numtyp rr3ik = bn[1] - ((numtyp)1.0-scalek*dmpij[2])*rr3; + numtyp rr5ik = bn[2] - ((numtyp)1.0-scalek*dmpij[4])*rr5; + numtyp rr7ik = bn[3] - ((numtyp)1.0-scalek*dmpij[6])*rr7; + numtyp rr9ik = bn[4] - ((numtyp)1.0-scalek*dmpij[8])*rr9; + numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11; + rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1; + rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; + numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + + term1i*rr1i + term1k*rr1k + term1ik*rr1ik + + term2i*rr3i + term2k*rr3k + term2ik*rr3ik + + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; + + // find damped multipole intermediates for force and torque + + numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + + term1i*rr3i + term1k*rr3k + term1ik*rr3ik + + term2i*rr5i + term2k*rr5k + term2ik*rr5ik + + term3i*rr7i + term3k*rr7k + 
term3ik*rr7ik; + term1 = -corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik; + term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik; + term3 = (numtyp)2.0 * rr5ik; + term4 = (numtyp)-2.0 * (corek*rr5i+valk*rr5ik - dkr*rr7ik+qkr*rr9ik); + term5 = (numtyp)-2.0 * (corei*rr5k+vali*rr5ik + dir*rr7ik+qir*rr9ik); + term6 = (numtyp)4.0 * rr7ik; + rr3 = rr3ik; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = 
(bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply charge penetration damping to scale factors + + numtyp corek = coeff_amclass[jtype].z; // pcore[jclass]; + numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; + numtyp valk = polar6[j].x; + numtyp dmpi[9],dmpk[9]; + numtyp dmpik[9]; + damppole(r,9,alphai,alphak,dmpi,dmpk,dmpik); + numtyp rr3core = bn[1] - ((numtyp)1.0-factor_dscale)*rr3; + numtyp rr5core = bn[2] - ((numtyp)1.0-factor_dscale)*rr5; + + numtyp rr3i = bn[1] - ((numtyp)1.0-factor_dscale*dmpi[2])*rr3; + numtyp rr5i = bn[2] - ((numtyp)1.0-factor_dscale*dmpi[4])*rr5; + numtyp rr7i = bn[3] - ((numtyp)1.0-factor_dscale*dmpi[6])*rr7; + numtyp rr9i = bn[4] - ((numtyp)1.0-factor_dscale*dmpi[8])*rr9; + numtyp rr3k = bn[1] - ((numtyp)1.0-factor_dscale*dmpk[2])*rr3; + numtyp rr5k = bn[2] - ((numtyp)1.0-factor_dscale*dmpk[4])*rr5; + numtyp rr7k = bn[3] - ((numtyp)1.0-factor_dscale*dmpk[6])*rr7; + numtyp rr9k = bn[4] - ((numtyp)1.0-factor_dscale*dmpk[8])*rr9; + numtyp rr5ik = bn[2] - ((numtyp)1.0-factor_wscale*dmpik[4])*rr5; + numtyp rr7ik = bn[3] - ((numtyp)1.0-factor_wscale*dmpik[6])*rr7; + + // get the induced dipole field used for dipole torques + + numtyp tix3 = (numtyp)2.0*rr3i*ukx; + numtyp tiy3 = (numtyp)2.0*rr3i*uky; + numtyp tiz3 = (numtyp)2.0*rr3i*ukz; + numtyp tuir = (numtyp)-2.0*rr5i*ukr; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)4.0 * (rr5i*ukx); + numtyp tiy5 = (numtyp)4.0 * (rr5i*uky); + numtyp tiz5 = (numtyp)4.0 * (rr5i*ukz); + tuir = (numtyp)-2.0*rr7i*ukr; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the field 
gradient for direct polarization force + + numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; + numtyp term1k,term2k,term3k,term4k,term5k,term6k,term7k,term8k; + numtyp term1core; + numtyp tixx,tiyy,tizz,tixy,tixz,tiyz; + numtyp tkxx,tkyy,tkzz,tkxy,tkxz,tkyz; + + term1i = rr3i - rr5i*xr*xr; + term1core = rr3core - rr5core*xr*xr; + term2i = (numtyp)2.0*rr5i*xr ; + term3i = rr7i*xr*xr - rr5i; + term4i = (numtyp)2.0*rr5i; + term5i = (numtyp)5.0*rr7i*xr; + term6i = rr9i*xr*xr; + term1k = rr3k - rr5k*xr*xr; + term2k = (numtyp)2.0*rr5k*xr; + term3k = rr7k*xr*xr - rr5k; + term4k = (numtyp)2.0*rr5k; + term5k = (numtyp)5.0*rr7k*xr; + term6k = rr9k*xr*xr; + tixx = vali*term1i + corei*term1core + dix*term2i - dir*term3i - + qixx*term4i + qix*term5i - qir*term6i + (qiy*yr+qiz*zr)*rr7i; + tkxx = valk*term1k + corek*term1core - dkx*term2k + dkr*term3k - + qkxx*term4k + qkx*term5k - qkr*term6k + (qky*yr+qkz*zr)*rr7k; + + term1i = rr3i - rr5i*yr*yr; + term1core = rr3core - rr5core*yr*yr; + term2i = (numtyp)2.0*rr5i*yr; + term3i = rr7i*yr*yr - rr5i; + term4i = (numtyp)2.0*rr5i; + term5i = (numtyp)5.0*rr7i*yr; + term6i = rr9i*yr*yr; + term1k = rr3k - rr5k*yr*yr; + term2k = (numtyp)2.0*rr5k*yr; + term3k = rr7k*yr*yr - rr5k; + term4k = (numtyp)2.0*rr5k; + term5k = (numtyp)5.0*rr7k*yr; + term6k = rr9k*yr*yr; + tiyy = vali*term1i + corei*term1core + diy*term2i - dir*term3i - + qiyy*term4i + qiy*term5i - qir*term6i + (qix*xr+qiz*zr)*rr7i; + tkyy = valk*term1k + corek*term1core - dky*term2k + dkr*term3k - + qkyy*term4k + qky*term5k - qkr*term6k + (qkx*xr+qkz*zr)*rr7k; + + term1i = rr3i - rr5i*zr*zr; + term1core = rr3core - rr5core*zr*zr; + term2i = (numtyp)2.0*rr5i*zr; + term3i = rr7i*zr*zr - rr5i; + term4i = (numtyp)2.0*rr5i; + term5i = (numtyp)5.0*rr7i*zr; + term6i = rr9i*zr*zr; + term1k = rr3k - rr5k*zr*zr; + term2k = (numtyp)2.0*rr5k*zr; + term3k = rr7k*zr*zr - rr5k; + term4k = (numtyp)2.0*rr5k; + term5k = (numtyp)5.0*rr7k*zr; + term6k = rr9k*zr*zr; + tizz = vali*term1i + 
corei*term1core + diz*term2i - dir*term3i - + qizz*term4i + qiz*term5i - qir*term6i + (qix*xr+qiy*yr)*rr7i; + tkzz = valk*term1k + corek*term1core - dkz*term2k + dkr*term3k - + qkzz*term4k + qkz*term5k - qkr*term6k + (qkx*xr+qky*yr)*rr7k; + + term2i = rr5i*xr ; + term1i = yr * term2i; + term1core = rr5core*xr*yr; + term3i = rr5i*yr; + term4i = yr * (rr7i*xr); + term5i = (numtyp)2.0*rr5i; + term6i = (numtyp)2.0*rr7i*xr; + term7i = (numtyp)2.0*rr7i*yr; + term8i = yr*rr9i*xr; + term2k = rr5k*xr; + term1k = yr * term2k; + term3k = rr5k*yr; + term4k = yr * (rr7k*xr); + term5k = (numtyp)2.0*rr5k; + term6k = (numtyp)2.0*rr7k*xr; + term7k = (numtyp)2.0*rr7k*yr; + term8k = yr*rr9k*xr; + tixy = -vali*term1i - corei*term1core + diy*term2i + dix*term3i - + dir*term4i - qixy*term5i + qiy*term6i + qix*term7i - qir*term8i; + tkxy = -valk*term1k - corek*term1core - dky*term2k - dkx*term3k + + dkr*term4k - qkxy*term5k + qky*term6k + qkx*term7k - qkr*term8k; + + term2i = rr5i*xr; + term1i = zr * term2i; + term1core = rr5core*xr*zr; + term3i = rr5i*zr; + term4i = zr * (rr7i*xr); + term5i = (numtyp)2.0*rr5i; + term6i = (numtyp)2.0*rr7i*xr; + term7i = (numtyp)2.0*rr7i*zr; + term8i = zr*rr9i*xr; + term2k = rr5k*xr; + term1k = zr * term2k; + term3k = rr5k*zr; + term4k = zr * (rr7k*xr); + term5k = (numtyp)2.0*rr5k; + term6k = (numtyp)2.0*rr7k*xr; + term7k = (numtyp)2.0*rr7k*zr; + term8k = zr*rr9k*xr; + tixz = -vali*term1i - corei*term1core + diz*term2i + dix*term3i - + dir*term4i - qixz*term5i + qiz*term6i + qix*term7i - qir*term8i; + tkxz = -valk*term1k - corek*term1core - dkz*term2k - dkx*term3k + + dkr*term4k - qkxz*term5k + qkz*term6k + qkx*term7k - qkr*term8k; + + term2i = rr5i*yr; + term1i = zr * term2i; + term1core = rr5core*yr*zr; + term3i = rr5i*zr; + term4i = zr * (rr7i*yr); + term5i = (numtyp)2.0*rr5i; + term6i = (numtyp)2.0*rr7i*yr; + term7i = (numtyp)2.0*rr7i*zr; + term8i = zr*rr9i*yr; + term2k = rr5k*yr; + term1k = zr * term2k; + term3k = rr5k*zr; + term4k = zr * (rr7k*yr); 
+ term5k = (numtyp)2.0*rr5k; + term6k = (numtyp)2.0*rr7k*yr; + term7k = (numtyp)2.0*rr7k*zr; + term8k = zr*rr9k*yr; + tiyz = -vali*term1i - corei*term1core + diz*term2i + diy*term3i - + dir*term4i - qiyz*term5i + qiz*term6i + qiy*term7i - qir*term8i; + tkyz = -valk*term1k - corek*term1core - dkz*term2k - dky*term3k + + dkr*term4k - qkyz*term5k + qkz*term6k + qky*term7k - qkr*term8k; + + numtyp depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + numtyp depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + numtyp depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + numtyp frcx = (numtyp)-2.0 * depx; + numtyp frcy = (numtyp)-2.0 * depy; + numtyp frcz = (numtyp)-2.0 * depz; + + numtyp term1,term2,term3; + + // get the dEp/dR terms used for direct polarization force + // poltyp == MUTUAL && hippo + // tixx and tkxx + term1 = (numtyp)2.0 * rr5ik; + term2 = term1*xr; + term3 = rr5ik - rr7ik*xr*xr; + tixx = uix*term2 + uir*term3; + tkxx = ukx*term2 + ukr*term3; + + // tiyy and tkyy + term2 = term1*yr; + term3 = rr5ik - rr7ik*yr*yr; + tiyy = uiy*term2 + uir*term3; + tkyy = uky*term2 + ukr*term3; + + // tiz and tkzz + term2 = term1*zr; + term3 = rr5ik - rr7ik*zr*zr; + tizz = uiz*term2 + uir*term3; + tkzz = ukz*term2 + ukr*term3; + + // tixy and tkxy + term1 = rr5ik*yr; + term2 = rr5ik*xr; + term3 = yr * (rr7ik*xr); + tixy = uix*term1 + uiy*term2 - uir*term3; + tkxy = ukx*term1 + uky*term2 - ukr*term3; + + // tixx and tkxx + term1 = rr5ik * zr; + term3 = zr * (rr7ik*xr); + tixz = uix*term1 + uiz*term2 - uir*term3; + tkxz = ukx*term1 + ukz*term2 - ukr*term3; + + // tiyz and tkyz + term2 = rr5ik*yr; + term3 = zr * (rr7ik*yr); + tiyz = uiy*term1 + uiz*term2 - uir*term3; + tkyz = uky*term1 + ukz*term2 - ukr*term3; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp 
+ tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx - depx; + frcy = frcy - depy; + frcz = frcz - depz; + + f.x += frcx; + f.y += frcy; + f.z += frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; k +class Hippo : public BaseAmoeba { + public: + Hippo(); + ~Hippo(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double polar_dscale, 
const double polar_uscale); + + /// Compute repulsion with device neighboring + virtual void compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5,void** tep_ptr); + + /// Compute dispersion real-space with device neighboring + virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp); + + /// Compute multipole real-space with device neighboring + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double felec, const double off2_mpole, double *charge, + double *boxlo, double *prd, void **tep_ptr); + + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double* host_pval, + const double aewald, const double off2_polar, void** fieldp_ptr); + + /// Compute the real space part of the induced field 
(umutual2b) with device neighboring + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr); + + /// Compute polar real-space with device neighboring + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2_polar, + void **tep_ptr); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; + /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z; + UCL_D_Vec coeff_rep; + /// Special polar values [0-4]: + /// sp_polar.x = special_polar_wscale + /// sp_polar.y special_polar_pscale, + /// sp_polar.z = special_polar_piscale + /// sp_polar.w = special_mpole + UCL_D_Vec sp_polar; + /// Special nonpolar values [0-4]: + /// sp_nonpolar.x = special_hal + /// sp_nonpolar.y special_repel + /// sp_nonpolar.z = special_disp + UCL_D_Vec sp_nonpolar; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5; + numtyp _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + UCL_Kernel k_repulsion, k_dispersion; + + 
protected: + bool _allocated; + int repulsion(const int eflag, const int vflag); + int dispersion_real(const int eflag, const int vflag); + int multipole_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + +}; + +} + +#endif diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp new file mode 100644 index 0000000000..0cb00387ca --- /dev/null +++ b/lib/gpu/lal_hippo_ext.cpp @@ -0,0 +1,231 @@ +/*************************************************************************** + hippo_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to hippo acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_hippo.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Hippo HIPPOMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + 
const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale) { + HIPPOMF.clear(); + gpu_mode=HIPPOMF.device->gpu_mode(); + double gpu_split=HIPPOMF.device->particle_split(); + int first_gpu=HIPPOMF.device->first_device(); + int last_gpu=HIPPOMF.device->last_device(); + int world_me=HIPPOMF.device->world_me(); + int gpu_rank=HIPPOMF.device->gpu_rank(); + int procs_per_gpu=HIPPOMF.device->procs_per_gpu(); + + HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu); + + bool message=false; + if (HIPPOMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); + + HIPPOMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + HIPPOMF.estimate_gpu_overhead(); + return init_ok; +} + +void hippo_gpu_clear() { + HIPPOMF.clear(); +} + +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double ** /*host_uind*/, double ** /*host_uinp*/, 
double * /*host_pval*/, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); +} + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { + HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd, + cut2, c0, c1, c2, c3, c4, c5, tep_ptr); +} + +void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2) { + HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole, + aewald, off2); +} + +void hippo_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int 
*host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); +} + +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); +} + +void hippo_gpu_update_fieldp(void **fieldp_ptr) { + HIPPOMF.update_fieldp(fieldp_ptr); +} + +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, 
host_uind, host_uinp, host_pval, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); +} + +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi); +} + +double hippo_gpu_bytes() { + return HIPPOMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h new file mode 100644 index 0000000000..7ff62aa9a4 --- /dev/null +++ b/lib/gpu/lal_hippo_extra.h @@ -0,0 +1,431 @@ +/// ************************************************************************** +// hippo_extra.h +// ------------------- +// Trung Dac Nguyen +// +// Device code for hippo math routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndactrung@gmail.com +// ***************************************************************************/* + +#ifndef LAL_HIPPO_EXTRA_H +#define LAL_HIPPO_EXTRA_H + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#else +#endif + +#define MY_PI2 (numtyp)1.57079632679489661923 +#define MY_PI4 (numtyp)0.78539816339744830962 + +/* ---------------------------------------------------------------------- + damprep generates coefficients for the Pauli repulsion + damping function for powers of 
the interatomic distance + + literature reference: + + J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An + Anisotropic, Atomic Multipole Model", Journal of Chemical Physics, + 150, 084104 (2019) +------------------------------------------------------------------------- */ + +ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, + const numtyp rr3, const numtyp rr5, const numtyp rr7, + const numtyp rr9, const numtyp rr11, const int rorder, + const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11]) +{ + numtyp r3,r4; + numtyp r5,r6,r7,r8; + numtyp s,ds,d2s; + numtyp d3s,d4s,d5s; + numtyp dmpi2,dmpk2; + numtyp dmpi22,dmpi23; + numtyp dmpi24,dmpi25; + numtyp dmpi26,dmpi27; + numtyp dmpk22,dmpk23; + numtyp dmpk24,dmpk25; + numtyp dmpk26; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp pre,term,tmp; + + // compute tolerance value for damping exponents + + eps = (numtyp)0.001; + diff = dmpi-dmpk; // fabs(dmpi-dmpk) + if (diff < (numtyp)0) diff = -diff; + + // treat the case where alpha damping exponents are equal + + if (diff < eps) { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + r6 = r5 * r; + r7 = r6 * r; + dmpi2 = (numtyp)0.5 * dmpi; + dampi = dmpi2 * r; + expi = ucl_exp(-dampi); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpi26 = dmpi25 * dmpi2; + pre = (numtyp)128.0; + s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi; + + ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0; + d2s = dmpi24 * expi * r5 / (numtyp)9.0; + d3s = dmpi25 * expi * r6 / (numtyp)45.0; + d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0; + if (rorder >= 11) { + r8 = r7 * r; + dmpi27 = dmpi2 * dmpi26; + d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0; + } + + // treat the case where alpha damping exponents are unequal + + } else { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + dmpi2 = (numtyp)0.5 * dmpi; + dmpk2 = (numtyp)0.5 * dmpk; + dampi = 
dmpi2 * r; + dampk = dmpk2 * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpk22 = dmpk2 * dmpk2; + dmpk23 = dmpk22 * dmpk2; + dmpk24 = dmpk23 * dmpk2; + dmpk25 = dmpk24 * dmpk2; + term = dmpi22 - dmpk22; + pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0); + tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; + s = (dampi-tmp)*expk + (dampk+tmp)*expi; + + ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - + ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + + ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d3s = (dmpi2*dmpk23*r4/(numtyp)15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - + ((numtyp)4.0/(numtyp)15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - + (numtyp)4.0*dmpi2*dmpk22*r/term - (numtyp)4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + + ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi; + d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - + ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - + ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + 
(dmpi24*dmpk2*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + + ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + + ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + + if (rorder >= 11) { + r6 = r5 * r; + dmpi26 = dmpi25 * dmpi2; + dmpk26 = dmpk25 * dmpk2; + d5s = (dmpi2*dmpk25*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi2*dmpk24*r5 + + dmpi2*dmpk23*r4/(numtyp)21.0 + dmpi2*dmpk22*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 - + ((numtyp)4.0/(numtyp)945.0)*dmpi2*dmpk26*r5/term - + ((numtyp)4.0/(numtyp)63.0)*dmpi2*dmpk25*r4/term - ((numtyp)4.0/(numtyp)9.0)*dmpi2*dmpk24*r3/term - + ((numtyp)16.0/(numtyp)9.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi25*dmpk2*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi24*dmpk2*r5 + + dmpi23*dmpk2*r4/(numtyp)21.0 + dmpi22*dmpk2*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 + + ((numtyp)4.0/(numtyp)945.0)*dmpi26*dmpk2*r5/term + ((numtyp)4.0/(numtyp)63.0)*dmpi25*dmpk2*r4/term + + ((numtyp)4.0/(numtyp)9.0)*dmpi24*dmpk2*r3/term + ((numtyp)16.0/(numtyp)9.0)*dmpi23*dmpk2*r2/term + + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + } + } + + // convert partial derivatives into full derivatives + + s = s * rr1; + ds = ds * rr3; + d2s = d2s * rr5; + d3s = d3s * rr7; + d4s = d4s * rr9; + d5s = d5s * rr11; + dmpik[0] = (numtyp)0.5 * pre * s * s; + dmpik[2] = pre * s * ds; + dmpik[4] = pre * (s*d2s + ds*ds); + dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s); + dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s); + + if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s); +} + +/* ---------------------------------------------------------------------- + damppole generates coefficients for the charge 
penetration + damping function for powers of the interatomic distance + + literature references: + + L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the + Effective Fragment Potential Method: Theory and Application to + the Benzene Dimer", Journal of Computational Chemistry, 28, + 276-291 (2007) [Gordon f1 and f2 models] + + J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and + J. W. Ponder, "An Optimized Charge Penetration Model for Use with + the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19, + 276-291 (2017) +------------------------------------------------------------------------- */ + +ucl_inline void damppole(const numtyp r, const int rorder, + const numtyp alphai, const numtyp alphak, + numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampi6,dampi7; + numtyp dampi8; + numtyp dampk2,dampk3; + numtyp dampk4,dampk5; + numtyp dampk6; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi; 
+ if (diff < eps) { + dmpk[0] = dmpi[0]; + dmpk[2] = dmpi[2]; + dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + dmpk[8] = dmpi[8]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dampk5 = dampk2 * dampk3; + dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk; + dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk; + } + + // valence-valence charge penetration damping for Gordon f1 + + if (diff < eps) { + dampi6 = dampi3 * dampi3; + dampi7 = dampi3 * dampi4; + dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + + dampi3/(numtyp)48.0)*expi; + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi; + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; + dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi; + dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0)*expi; + if (rorder >= 11) { + dampi8 = dampi4 * dampi4; + dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi; + } + + } else { + alphai2 = alphai * alphai; + 
alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * termk; + dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - + termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk; + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk; + dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk; + dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + + (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + + (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk; + + if (rorder >= 11) { + dampi6 = dampi3 * dampi3; + dampk6 = dampk3 * 
dampk3; + dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + + dampi6/(numtyp)1890.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + + dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk; + } + } +} + +/* ---------------------------------------------------------------------- + dampdir = direct field damping coefficents + dampdir generates coefficients for the direct field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ + +ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk) +{ + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampk2; + numtyp dampi3,dampk3; + numtyp dampi4,dampk4; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = dampi2 * dampi2; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + if (diff < eps) { + dmpk[2] = dmpi[2]; + 
dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/30.0)*expk; + } +} + +/* ---------------------------------------------------------------------- + dampmut = mutual field damping coefficents + dampmut generates coefficients for the mutual field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ + +ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampk2,dampk3; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // valence-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + if (diff < eps) { + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + 7.0*dampi3/(numtyp)48.0 + dampi4/48.0)*expi; + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + alphai2 = alphai * alphai; + alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * 
termk; + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - + (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - + (numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk; + } +} + +#endif diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index a327fdd45b..10816e2fa6 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -576,6 +576,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); if (_time_device) time_nbor.add_to_total(); + + // on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial) + // on the device, transpose the matrix (1-d array) for coalesced reads + // dev_special[i][j] = the special i neighbor of atom j + time_transpose.start(); const int b2x=_block_cell_2d; const int b2y=_block_cell_2d; @@ -679,6 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (_cutoff < _cell_size) vadjust*=1.46; mn=std::max(mn,static_cast(ceil(_max_neighbor_factor*vadjust*mn))); if (mn<33) mn+=3; + resize_max_neighbors(mn,success); set_nbor_block_size(mn/2); if (!success) @@ -831,6 +837,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); } +void Neighbor::transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in) +{ + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; + const int g2x=static_cast(ceil(static_cast(columns_in)/b2x)); + const int g2y=static_cast(ceil(static_cast(rows_in)/b2y)); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + 
_shared->k_transpose.run(&out, &in, &columns_in, &rows_in); +} + template void Neighbor::build_nbor_list (double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, double *subhi, diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 5b569f804a..45ec95a9d1 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -33,7 +33,7 @@ #endif #endif -#if defined(USE_HIP) +#if defined(USE_HIP) || defined(__APPLE__) #define LAL_USE_OLD_NEIGHBOR #endif @@ -259,6 +259,10 @@ class Neighbor { return o.str(); } + /// Helper function + void transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in); + private: NeighborShared *_shared; UCL_Device *dev; @@ -289,15 +293,17 @@ class Neighbor { #endif int _simd_size; + #ifdef LAL_USE_OLD_NEIGHBOR inline void set_nbor_block_size(const int mn) { - #ifdef LAL_USE_OLD_NEIGHBOR int desired=mn/(2*_simd_size); desired*=_simd_size; if (desired<_simd_size) desired=_simd_size; else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build; _block_nbor_build=desired; - #endif } + #else + inline void set_nbor_block_size(const int) {} + #endif }; } diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 352f1d6138..359d9b75cb 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -48,6 +48,19 @@ _texture_2d( pos_tex,int4); #define LAL_USE_OLD_NEIGHBOR #endif +/* + compute the id of the cell where the atoms belong to +x: atom coordinates +cell_id: cell ids +particle_id: +boxlo[0-2]: the lower left corner of the local box +ncell[xyz]: the number of cells in xyz dims +i_cell_size is the inverse cell size +inum = the number of the local atoms that are ported to the device +nall = the number of the local+ghost atoms that are ported to the device +cells_in_cutoff = the number of cells that are within the cutoff +*/ + __kernel void calc_cell_id(const numtyp4 *restrict x_, unsigned *restrict cell_id, int *restrict 
particle_id, @@ -90,6 +103,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_, } } +// compute the number of atoms in each cell + __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, int *restrict cell_counts, int nall, int ncell) { diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index e17df5b88c..a8e929efe4 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -273,19 +273,19 @@ __kernel void interp(const __global numtyp4 *restrict x_, int my=mz+fast_mul(ny,npts_x); for (int m=0; m> SBBITS & 3; }; +#define SBBITS15 29 +#define NEIGHMASK15 0x1FFFFFFF +ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; }; + // default to 32-bit smallint and other ints, 64-bit bigint: // same as defined in src/lmptype.h #if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index eb42c710cc..9687a0352d 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -150,7 +150,7 @@ double SWT::host_memory_usage() const { // --------------------------------------------------------------------------- template int SWT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index 8baa5ce12a..feab8bb5c0 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4); } \ } +// (SHUFFLE_AVAIL == 1) #else #define local_allocate_acc_zeta() @@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4); } \ } +// EVFLAG == 0 #else #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ @@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4); ans[ii]=old; \ } -#endif -#endif +#endif // EVFLAG +#endif // SHUFFLE_AVAIL #ifdef LAL_SIMD_IP_SYNC #define t_per_atom t_per_atom_in diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index c343de3f55..fcc9d00ab0 
100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i const double* costheta, const double* bigb, const double* big2b, const double* bigc) { - int success; + int success=0; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,vashishta,"k_vashishta","k_vashishta_three_center", "k_vashishta_three_end","k_vashishta_short_nbor"); @@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const { // --------------------------------------------------------------------------- template int VashishtaT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index f222613c3c..ae3dbf16c4 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -22,6 +22,7 @@ #include "memory.h" #include "neighbor.h" #include "remap_wrap.h" +#include "timer.h" #include #include @@ -326,15 +327,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() cfft[n++] = ZEROF; } + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -382,15 +391,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() debug_scalar(FFT,"PRE Convo / POST Remap"); debug_file(FFT,"pre.convo.post.remap"); #endif + + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + // perform 
forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -423,7 +441,16 @@ void *AmoebaConvolution::post_convolution_3d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = platform::walltime(); + + time_fft += time1 - time0; #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); @@ -465,8 +492,18 @@ void *AmoebaConvolution::post_convolution_4d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = platform::walltime(); + + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); debug_file(CFFT2,"post.convo.post.fft"); diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 99ad11ade4..bed65149ec 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers { int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out; int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft; bigint nfft_global; // nx * ny * nz - double *grid_brick_start; // lower left corner of (c)grid_brick data + FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int); ~AmoebaConvolution(); @@ -47,35 +47,37 @@ class AmoebaConvolution : protected Pointers { 
FFT_SCALAR *pre_convolution(); void *post_convolution(); - private: - int which; // caller name for convolution being performed - int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick - int nbrick_owned; // owned grid points in brick decomp - int nbrick_ghosts; // owned + ghost brick grid points - int ngrid_either; // max of nbrick_owned or nfft_owned + double time_fft; + + protected: + int which; // caller name for convolution being performed + int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick + int nbrick_owned; // owned grid points in brick decomp + int nbrick_ghosts; // owned + ghost brick grid points + int ngrid_either; // max of nbrick_owned or nfft_owned class Pair *amoeba; class FFT3d *fft1, *fft2; class Grid3d *gc; class Remap *remap; - double ***grid_brick; // 3d real brick grid with ghosts - double ****cgrid_brick; // 4d complex brick grid with ghosts + FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts + FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector - double *gc_buf1, *gc_buf2; // buffers for GridComm - double *remap_buf; // buffer for Remap + FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm + FFT_SCALAR *remap_buf; // buffer for Remap void allocate_grid(); void deallocate_grid(); void *zero_3d(); void *zero_4d(); FFT_SCALAR *pre_convolution_3d(); - FFT_SCALAR *pre_convolution_4d(); + virtual FFT_SCALAR *pre_convolution_4d(); void *post_convolution_3d(); - void *post_convolution_4d(); + virtual void *post_convolution_4d(); void procs2grid2d(int, int, int, int &, int &); // DEBUG diff --git a/src/AMOEBA/amoeba_dispersion.cpp b/src/AMOEBA/amoeba_dispersion.cpp index f3af921d85..cc283f22d2 100644 --- a/src/AMOEBA/amoeba_dispersion.cpp +++ b/src/AMOEBA/amoeba_dispersion.cpp @@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // 
zeroed by zero() - double ***gridpre = (double ***) d_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero(); // map atoms to grid @@ -294,7 +294,7 @@ void PairAmoeba::dispersion_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = d_kspace->pre_convolution(); + FFT_SCALAR *gridfft = d_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index a6724e2bb7..ecc20a198c 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -24,6 +24,7 @@ #include "math_special.h" #include "my_page.h" #include "neigh_list.h" +#include "timer.h" #include @@ -381,8 +382,6 @@ void PairAmoeba::induce() } } - // if (comm->me == 0) printf("CG iteration count = %d\n",iter); - // terminate the calculation if dipoles failed to converge // NOTE: could make this an error @@ -546,13 +545,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } } - // get the reciprocal space part of the mutual field - - if (polar_kspace_flag) umutual1(field,fieldp); + double time0, time1, time2; + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); // get the real space portion of the mutual field if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = platform::walltime(); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + time2 = platform::walltime(); // add the self-energy portion of the mutual field @@ -563,6 +568,11 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) fieldp[i][j] += term*uinp[i][j]; } } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -785,7 +795,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // 
get the reciprocal space part of the permanent field + double time0, time1, time2; + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + if (polar_kspace_flag) udirect1(field); + time1 = platform::walltime(); for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { @@ -796,6 +811,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the real space portion of the permanent field if (polar_rspace_flag) udirect2b(field,fieldp); + time2 = platform::walltime(); // get the self-energy portion of the permanent field @@ -806,6 +822,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) fieldp[i][j] += term*rpole[i][j+1]; } } + + // accumulate timing information + + time_direct_kspace += time1 - time0; + time_direct_rspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -842,18 +863,26 @@ void PairAmoeba::umutual1(double **field, double **fieldp) } } + double time0, time1; + // gridpre = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre = (double ****) ic_kspace->zero(); + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + grid_uind(fuind,fuinp,gridpre); + time1 = platform::walltime(); + time_grid_uind += (time1 - time0); + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = ic_kspace->pre_convolution(); + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation @@ -883,12 +912,18 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) ic_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get 
potential + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + time1 = platform::walltime(); + time_fphi_uind += (time1 - time0); + // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { @@ -1055,7 +1090,7 @@ void PairAmoeba::udirect1(double **field) // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by setup() - double ***gridpre = (double ***) i_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero(); // map multipole moments to grid @@ -1064,7 +1099,7 @@ void PairAmoeba::udirect1(double **field) // pre-convolution operations including forward FFT // gridfft = my 1d portion of complex 3d grid in FFT decomp - double *gridfft = i_kspace->pre_convolution(); + FFT_SCALAR *gridfft = i_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1109,7 +1144,7 @@ void PairAmoeba::udirect1(double **field) // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) i_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_kspace.cpp b/src/AMOEBA/amoeba_kspace.cpp index da6483ef40..6d2fb64dd6 100644 --- a/src/AMOEBA/amoeba_kspace.cpp +++ b/src/AMOEBA/amoeba_kspace.cpp @@ -68,25 +68,23 @@ void PairAmoeba::moduli() int maxfft = MAX(nfft1,nfft2); maxfft = MAX(maxfft,nfft3); - double *array = new double[bsorder]; - double *bsarray = new double[maxfft]; + if (maxfft > _nfft_max) { + memory->destroy(_moduli_bsarray); + _nfft_max = maxfft; + memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray"); + } // compute and load the moduli values double x = 0.0; - bspline(x,bsorder,array); + bspline(x,bsorder,_moduli_array); - for (i = 0; i < maxfft; i++) bsarray[i] = 0.0; - for (i = 0; i < bsorder; i++) bsarray[i+1] = 
array[i]; + for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0; + for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i]; - dftmod(bsmod1,bsarray,nfft1,bsorder); - dftmod(bsmod2,bsarray,nfft2,bsorder); - dftmod(bsmod3,bsarray,nfft3,bsorder); - - // perform deallocation of local arrays - - delete[] array; - delete[] bsarray; + dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder); + dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder); + dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder); } /* ---------------------------------------------------------------------- @@ -525,7 +523,7 @@ void PairAmoeba::frac_to_cart() grid_mpole maps fractional atomic multipoles to PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_mpole(double **fmp, double ***grid) +void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -598,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid) the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_mpole(double ***grid, double **fphi) +void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi) { int i,j,k,m,ib,jb,kb; double v0,v1,v2,v3; @@ -742,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi) grid_uind maps fractional induced dipoles to the PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid) +void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -793,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid) fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, 
+void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, double **fdip_phi2, double **fdip_sum_phi) { int i,j,k,m,ib,jb,kb; @@ -1042,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, grid_disp maps dispersion coefficients to PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_disp(double ***grid) +void PairAmoeba::grid_disp(FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb,itype,iclass; double v0,u0,t0; diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index f58395aa1c..a1503a91f3 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -21,6 +21,7 @@ #include "math_const.h" #include "math_special.h" #include "neigh_list.h" +#include "timer.h" #include @@ -55,6 +56,8 @@ void PairAmoeba::multipole() double qixx,qixy,qixz,qiyy,qiyz,qizz; double cii,dii,qii; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(MPOLE_LONG); @@ -78,13 +81,18 @@ void PairAmoeba::multipole() felec = electric / am_dielectric; + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + // compute the real space part of the Ewald summation if (mpole_rspace_flag) multipole_real(); + time1 = platform::walltime(); // compute the reciprocal space part of the Ewald summation if (mpole_kspace_flag) multipole_kspace(); + time2 = platform::walltime(); // compute the Ewald self-energy term over all the atoms @@ -109,6 +117,11 @@ void PairAmoeba::multipole() e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0)); empole += e; } + + // accumulate timing information + + time_mpole_rspace += time1 - time0; + time_mpole_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -361,6 +374,9 @@ void PairAmoeba::multipole_real() bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2; } for (k = 0; k < 6; k++) bn[k] *= felec; + //if (i == 0 && j < 10) { + // printf("j = %d: 
aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]); + //} // find damped multipole intermediates and energy value @@ -404,6 +420,8 @@ void PairAmoeba::multipole_real() term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; + + // find damped multipole intermediates for force and torque de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + @@ -444,6 +462,7 @@ void PairAmoeba::multipole_real() term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9); term6 = 4.0 * rr7; + } empole += e; @@ -482,6 +501,7 @@ void PairAmoeba::multipole_real() tq[i][2] += ttmi[2]; // increment force-based gradient and torque on second site + // commenting out j parts for DEBUGGING f[j][0] += frcx; f[j][1] += frcy; @@ -638,7 +658,7 @@ void PairAmoeba::multipole_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) m_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero(); // map atoms to grid @@ -647,7 +667,7 @@ void PairAmoeba::multipole_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = m_kspace->pre_convolution(); + FFT_SCALAR *gridfft = m_kspace->pre_convolution(); // --------------------- // convolution operation @@ -718,7 +738,7 @@ void PairAmoeba::multipole_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) m_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index 4d143c7a22..3c51426beb 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -21,6 +21,7 @@ #include "math_const.h" #include "math_special.h" #include "neigh_list.h" 
+#include "timer.h" #include #include @@ -55,6 +56,8 @@ void PairAmoeba::polar() double fix[3],fiy[3],fiz[3]; double tep[3]; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(POLAR_LONG); @@ -76,11 +79,16 @@ void PairAmoeba::polar() // compute the real space part of the dipole interactions + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + if (polar_rspace_flag) polar_real(); + time1 = platform::walltime(); // compute the reciprocal space part of dipole interactions if (polar_kspace_flag) polar_kspace(); + time2 = platform::walltime(); // compute the Ewald self-energy torque and virial terms @@ -133,6 +141,11 @@ void PairAmoeba::polar() virpolar[4] -= vxz; virpolar[5] -= vyz; } + + // accumulate timing information + + time_polar_rspace += time1 - time0; + time_polar_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -382,7 +395,7 @@ void PairAmoeba::polar_real() factor_uscale = 1.0; } } - + //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale); r = sqrt(r2); ck = rpole[j][0]; dkx = rpole[j][1]; @@ -597,7 +610,6 @@ void PairAmoeba::polar_real() dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir; dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir; dufld[i][5] += zr*tiz5 + zr*zr*tuir; - dufld[j][0] -= xr*tkx5 + xr*xr*tukr; dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr; dufld[j][2] -= yr*tky5 + yr*yr*tukr; @@ -855,6 +867,7 @@ void PairAmoeba::polar_real() frcx = -2.0 * depx; frcy = -2.0 * depy; frcz = -2.0 * depz; + } // get the dtau/dr terms used for mutual polarization force @@ -1327,7 +1340,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1336,7 +1349,7 @@ void PairAmoeba::polar_kspace() // 
pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1386,7 +1399,7 @@ void PairAmoeba::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) p_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); // get potential @@ -1419,7 +1432,7 @@ void PairAmoeba::polar_kspace() // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre2 = (double ****) pc_kspace->zero(); + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); // map 2 values to grid @@ -1428,7 +1441,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = pc_kspace->pre_convolution(); + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1451,7 +1464,7 @@ void PairAmoeba::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) pc_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); // get potential @@ -1857,7 +1870,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1887,7 +1900,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - gridpre = (double ***) p_kspace->zero(); + 
gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1896,7 +1909,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1953,7 +1966,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1962,12 +1975,12 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // gridfft1 = copy of first FFT int nfft_owned = p_kspace->nfft_owned; - memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double)); + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); // assign ??? 
to the PME grid @@ -1982,7 +1995,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1991,7 +2004,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/fix_amoeba_bitorsion.cpp b/src/AMOEBA/fix_amoeba_bitorsion.cpp index aeba26fb4d..cb8c62819d 100644 --- a/src/AMOEBA/fix_amoeba_bitorsion.cpp +++ b/src/AMOEBA/fix_amoeba_bitorsion.cpp @@ -194,8 +194,8 @@ void FixAmoebaBiTorsion::init() // error check that PairAmoeba or PairHiippo exist pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); if (!pair) error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo"); diff --git a/src/AMOEBA/improper_amoeba.cpp b/src/AMOEBA/improper_amoeba.cpp index b1e403da78..cb9db01b59 100644 --- a/src/AMOEBA/improper_amoeba.cpp +++ b/src/AMOEBA/improper_amoeba.cpp @@ -285,8 +285,9 @@ void ImproperAmoeba::init_style() // check if PairAmoeba disabled improper terms Pair *pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); + if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo"); int tmp; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index e8b7a18dba..0812fe43f0 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -29,6 +29,7 @@ #include "my_page.h" #include 
"neigh_list.h" #include "neighbor.h" +#include "timer.h" #include "update.h" #include @@ -47,6 +48,7 @@ enum{MUTUAL,OPT,TCG,DIRECT}; enum{GEAR,ASPC,LSQR}; #define DELTASTACK 16 +#define DEBUG_AMOEBA 0 /* ---------------------------------------------------------------------- */ @@ -85,6 +87,10 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp) cmp = fmp = nullptr; cphi = fphi = nullptr; + _moduli_array = nullptr; + _moduli_bsarray = nullptr; + _nfft_max = 0; + poli = nullptr; conj = conjp = nullptr; vec = vecp = nullptr; @@ -227,6 +233,9 @@ PairAmoeba::~PairAmoeba() memory->destroy(fphidp); memory->destroy(cphidp); + memory->destroy(_moduli_array); + memory->destroy(_moduli_bsarray); + memory->destroy(thetai1); memory->destroy(thetai2); memory->destroy(thetai3); @@ -349,12 +358,22 @@ void PairAmoeba::compute(int eflag, int vflag) if (update->ntimestep <= update->beginstep+1) { time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0; time_induce = time_polar = time_qxfer = 0.0; + + time_mpole_rspace = time_mpole_kspace = 0.0; + time_direct_rspace = time_direct_kspace = 0.0; + time_mutual_rspace = time_mutual_kspace = 0.0; + time_polar_rspace = time_polar_kspace = 0.0; + + time_grid_uind = time_fphi_uind = 0.0; + if (ic_kspace) { + ic_kspace->time_fft = 0.0; + } } double time0,time1,time2,time3,time4,time5,time6,time7,time8; - MPI_Barrier(world); - time0 = MPI_Wtime(); + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); // if reneighboring step: // augment neighbor list to include 1-5 neighbor flags @@ -410,8 +429,7 @@ void PairAmoeba::compute(int eflag, int vflag) comm->forward_comm(this); if (amoeba) pbc_xred(); - - time1 = MPI_Wtime(); + time1 = platform::walltime(); // ---------------------------------------- // compute components of force field @@ -420,22 +438,22 @@ void PairAmoeba::compute(int eflag, int vflag) // buffered 14-7 Vdwl, pairwise if (amoeba && hal_flag) hal(); - time2 = MPI_Wtime(); + time2 = platform::walltime(); 
// Pauli repulsion, pairwise if (!amoeba && repulse_flag) repulsion(); - time3 = MPI_Wtime(); + time3 = platform::walltime(); // Ewald dispersion, pairwise and long range if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion(); - time4 = MPI_Wtime(); + time4 = platform::walltime(); // multipole, pairwise and long range if (mpole_rspace_flag || mpole_kspace_flag) multipole(); - time5 = MPI_Wtime(); + time5 = platform::walltime(); // induced dipoles, interative CG relaxation // communicate induce() output values needed by ghost atoms @@ -445,17 +463,17 @@ void PairAmoeba::compute(int eflag, int vflag) cfstyle = INDUCE; comm->forward_comm(this); } - time6 = MPI_Wtime(); + time6 = platform::walltime(); // dipoles, pairwise and long range if (polar_rspace_flag || polar_kspace_flag) polar(); - time7 = MPI_Wtime(); + time7 = platform::walltime(); // charge transfer, pairwise if (!amoeba && qxfer_flag) charge_transfer(); - time8 = MPI_Wtime(); + time8 = platform::walltime(); // store energy components for output by compute pair command @@ -518,6 +536,44 @@ void PairAmoeba::finish() MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_qxfer = ave/comm->nprocs; + #if DEBUG_AMOEBA + // real-space/kspace breakdown + MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + 
time_polar_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_polar_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_grid_uind = ave/comm->nprocs; + + MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_fphi_uind = ave/comm->nprocs; + + double time_mutual_fft = 0; + if (ic_kspace) time_mutual_fft = ic_kspace->time_fft; + MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_fft = ave/comm->nprocs; + #endif // DEBUG_AMOEBA + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -534,8 +590,27 @@ void PairAmoeba::finish() utils::logmesg(lmp," Induce time: {:<12.6g} {:6.2f}%\n", time_induce, time_induce/time_total); utils::logmesg(lmp," Polar time: {:<12.6g} {:6.2f}%\n", time_polar, time_polar/time_total); if (!amoeba) - utils::logmesg(lmp," Qxfer time: {:<12.6g} {:6.2f}%\n", time_qxfer, time_qxfer/time_total); - utils::logmesg(lmp," Total time: {:<12.6g}\n",time_total * 100.0); + utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); + utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); + + #if DEBUG_AMOEBA + double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace; + double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace; + + utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, 
time_polar_rspace/time_total); + utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total); + utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); + utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); + #endif } } @@ -2320,6 +2395,8 @@ void PairAmoeba::grow_local() firstneigh_pcpc = (double **) memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc"); } + + memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array"); } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 847764244b..cdeee6c95f 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -82,6 +82,12 @@ class PairAmoeba : public Pair { double time_init, time_hal, time_repulse, time_disp; double time_mpole, time_induce, time_polar, time_qxfer; + double time_mpole_rspace, time_mpole_kspace; + double time_direct_rspace, time_direct_kspace; + double time_mutual_rspace, time_mutual_kspace; + double time_polar_rspace, time_polar_kspace; + double time_grid_uind, time_fphi_uind; + // energy/virial components double ehal, erepulse, edisp, epolar, empole, eqxfer; @@ -324,8 +330,12 @@ class PairAmoeba : public Pair { double *qfac; // convoulution pre-factors double *gridfft1; // copy of p_kspace FFT grid - double **cmp, **fmp; // Cartesian and fractional multipoles - double **cphi, 
**fphi; + double **cmp,**fmp; // Cartesian and fractional multipoles + double **cphi,**fphi; + + double *_moduli_array; // buffers for moduli + double *_moduli_bsarray; + int _nfft_max; // params for current KSpace solve and FFT being worked on @@ -335,8 +345,12 @@ class PairAmoeba : public Pair { double ctf[10][10]; // indices NOT flipped vs Fortran double ftc[10][10]; // indices NOT flipped vs Fortran - class AmoebaConvolution *m_kspace, *p_kspace, *pc_kspace, *d_kspace; - class AmoebaConvolution *i_kspace, *ic_kspace; + class AmoebaConvolution *m_kspace; // multipole KSpace + class AmoebaConvolution *p_kspace; // polar KSpace + class AmoebaConvolution *pc_kspace; + class AmoebaConvolution *d_kspace; // dispersion KSpace + class AmoebaConvolution *i_kspace; // induce KSpace + class AmoebaConvolution *ic_kspace; // FFT grid size factors @@ -347,33 +361,33 @@ class PairAmoeba : public Pair { void hal(); - void repulsion(); - void damprep(double, double, double, double, double, double, double, double, int, double, double, - double *); + virtual void repulsion(); + void damprep(double, double, double, double, double, double, double, double, + int, double, double, double *); void dispersion(); - void dispersion_real(); + virtual void dispersion_real(); void dispersion_kspace(); void multipole(); - void multipole_real(); + virtual void multipole_real(); void multipole_kspace(); void polar(); void polar_energy(); - void polar_real(); - void polar_kspace(); + virtual void polar_real(); + virtual void polar_kspace(); void damppole(double, int, double, double, double *, double *, double *); - void induce(); + virtual void induce(); void ulspred(); - void ufield0c(double **, double **); + virtual void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); void dfield0c(double **, double **); - void umutual1(double **, double **); - void umutual2b(double **, double **); + virtual void umutual1(double **, double **); + virtual void 
umutual2b(double **, double **); void udirect1(double **); - void udirect2b(double **, double **); + virtual void udirect2b(double **, double **); void dampmut(double, double, double, double *); void dampdir(double, double, double, double *, double *); void cholesky(int, double *, double *); @@ -393,11 +407,11 @@ class PairAmoeba : public Pair { void fphi_to_cphi(double **, double **); void frac_to_cart(); - void grid_mpole(double **, double ***); - void fphi_mpole(double ***, double **); - void grid_uind(double **, double **, double ****); - void fphi_uind(double ****, double **, double **, double **); - void grid_disp(double ***); + void grid_mpole(double **, FFT_SCALAR ***); + void fphi_mpole(FFT_SCALAR ***, double **); + void grid_uind(double **, double **, FFT_SCALAR ****); + virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **); + void grid_disp(FFT_SCALAR ***); void kewald(); void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &, diff --git a/src/Depend.sh b/src/Depend.sh index 10d612f490..470a0a2a2b 100755 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -45,6 +45,10 @@ depend () { # add one if statement per parent package # add one depend() call per child package that depends on that parent +if (test $1 = "AMOEBA") then + depend GPU +fi + if (test $1 = "ASPHERE") then depend GPU depend OPENMP diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index d28e6260f8..19e89498fc 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -28,6 +28,8 @@ action () { # list of files with optional dependcies +action amoeba_convolution_gpu.cpp amoeba_convolution.cpp +action amoeba_convolution_gpu.h amoeba_convolution.cpp action fix_gpu.cpp action fix_gpu.h action fix_nve_gpu.h @@ -41,6 +43,8 @@ action fix_npt_gpu.cpp action fix_nve_asphere_gpu.h fix_nve_asphere.h action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp action gpu_extra.h +action pair_amoeba_gpu.cpp pair_amoeba.cpp +action pair_amoeba_gpu.h 
pair_amoeba.h action pair_beck_gpu.cpp pair_beck.cpp action pair_beck_gpu.h pair_beck.h action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp @@ -89,6 +93,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp action pair_gauss_gpu.h pair_gauss.h action pair_gayberne_gpu.cpp pair_gayberne.cpp action pair_gayberne_gpu.h pair_gayberne.cpp +action pair_hippo_gpu.cpp pair_hippo.cpp +action pair_hippo_gpu.h pair_hippo.cpp action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp action pair_lj96_cut_gpu.h pair_lj96_cut.h action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp @@ -113,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h action pair_lj_cut_gpu.cpp action pair_lj_cut_gpu.h +action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp +action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp +action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp +action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp action pair_lj_smooth_gpu.cpp pair_lj_smooth.cpp action pair_lj_smooth_gpu.h pair_lj_smooth.cpp action pair_lj_expand_gpu.cpp @@ -155,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp action pppm_gpu.h pppm.cpp action pair_ufm_gpu.cpp pair_ufm.cpp action pair_ufm_gpu.h pair_ufm.h -action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp -action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp -action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp -action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp # edit 2 Makefile.package files to include/exclude package info diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp new file mode 100644 index 0000000000..908c9e409c --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -0,0 +1,181 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + 
https://www.lammps.org/ Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "amoeba_convolution_gpu.h" +#include "comm.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "grid3d.h" + +using namespace LAMMPS_NS; + +// DEBUG + +#define DEBUG_AMOEBA 0 +#if DEBUG_AMOEBA +char *labels[7] = + {(char *) "MPOLE_GRID", (char *) "POLAR_GRID", + (char *) "POLAR_GRIDC", (char *) "DISP_GRID", + (char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"}; + +enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2}; +#endif +// END DEBUG + +#define SCALE 0 + +//#define USE_AMOEBA_FFT +#ifdef USE_AMOEBA_FFT +// External functions from GPU library +int amoeba_setup_fft(const int size, const int numel, const int element_type); +int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode); +#endif + +/* ---------------------------------------------------------------------- + partition an FFT grid across processors + both for a brick and FFT x pencil decomposition + nx,nz,nz = global FFT grid size + order = size of stencil in each dimension that maps atoms to grid + adapted from PPPM::set_grid_local() +------------------------------------------------------------------------- */ + +AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, + int nx_caller, int ny_caller, int nz_caller, + int order_caller, int which_caller) : + AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller, + which_caller) +{ + +} + +/* ---------------------------------------------------------------------- + perform pre-convolution grid operations for 4d 
cgrid_brick array +------------------------------------------------------------------------- */ + +FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() +{ + int ix,iy,iz,n; + + // reverse comm for 4d brick grid + ghosts + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d"); +#endif + + gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR), + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d"); + debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d"); +#endif + // copy owned 4d brick grid values to FFT grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cfft[n++] = cgrid_brick[iz][iy][ix][0]; + cfft[n++] = cgrid_brick[iz][iy][ix][1]; + } + + // remap FFT grid from brick to x pencil partitioning + // NOTE: could just setup FFT to start from brick decomp and skip remap + + remap->perform(cfft,cfft,remap_buf); + +#if DEBUG_AMOEBA + debug_scalar(FFT,"PRE Convo / POST Remap"); + debug_file(FFT,"pre.convo.post.remap"); +#endif + + double time0,time1; + + MPI_Barrier(world); + time0 = platform::walltime(); + + // perform forward FFT + + #ifdef USE_AMOEBA_FFT + amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD); + #else + fft1->compute(cfft,cfft,FFT3d::FORWARD); + #endif + + time1 = platform::walltime(); + + time_fft += time1 - time0; + + if (SCALE) { + double scale = 1.0/nfft_global; + for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; + } + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"PRE Convo / POST FFT"); + debug_file(CFFT1,"pre.convo.post.fft"); +#endif + return cfft; +} + +/* ---------------------------------------------------------------------- + perform post-convolution grid operations for 4d cgrid_brick array +------------------------------------------------------------------------- */ + +void *AmoebaConvolutionGPU::post_convolution_4d() +{ + int ix,iy,iz,n; + + // perform 
backward FFT + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"POST Convo / PRE FFT"); + debug_file(CFFT1,"post.convo.pre.fft"); +#endif + + double time0,time1; + + MPI_Barrier(world); + time0 = platform::walltime(); + + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + + time1 = platform::walltime(); + + time_fft += time1 - time0; + +#if DEBUG_AMOEBA + debug_scalar(CFFT2,"POST Convo / POST FFT"); + debug_file(CFFT2,"post.convo.post.fft"); +#endif + // copy 1d complex values into 4d complex grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cgrid_brick[iz][iy][ix][0] = cfft[n++]; + cgrid_brick[iz][iy][ix][1] = cfft[n++]; + } + + // forward comm to populate ghost grid values + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d"); + debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d"); +#endif + gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR), + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + + return (void *) cgrid_brick; +} diff --git a/src/GPU/amoeba_convolution_gpu.h b/src/GPU/amoeba_convolution_gpu.h new file mode 100644 index 0000000000..4286f2155f --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.h @@ -0,0 +1,32 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H +#define LMP_AMOEBA_CONVOLUTION_GPU_H + +#include "amoeba_convolution.h" + + +namespace LAMMPS_NS { + +class AmoebaConvolutionGPU : public AmoebaConvolution { + public: + AmoebaConvolutionGPU(class LAMMPS *, class Pair *, int, int, int, int, int); + + FFT_SCALAR *pre_convolution_4d() override; + void *post_convolution_4d() override; + +}; + +} // namespace LAMMPS_NS +#endif diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 97f22da0a7..23191c12c8 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -131,7 +131,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : _gpu_mode = GPU_NEIGH; _particle_split = 1.0; int nthreads = 0; - int newtonflag = 0; + int newtonflag = force->newton_pair; int threads_per_atom = -1; double binsize = 0.0; char *opencl_args = nullptr; @@ -360,6 +360,8 @@ double FixGPU::memory_usage() return bytes; } +/* ---------------------------------------------------------------------- */ + double FixGPU::binsize(const double subx, const double suby, const double subz, const int nlocal, const double cut) { diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp new file mode 100644 index 0000000000..fd423486fd --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -0,0 +1,2067 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern/UChicago) +------------------------------------------------------------------------- */ + +#include "pair_amoeba_gpu.h" + +#include "amoeba_convolution_gpu.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "fix_store_peratom.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "math_const.h" +#include "memory.h" +#include "my_page.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" +#include + +using namespace LAMMPS_NS; +using namespace MathConst; + +// same as in amoeba_induce.cpp +enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; +enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +// same as in pair_amoeba.cpp +enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye + +// External functions from cuda library for atom decomposition + +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int* host_amtype2class, + const double *host_special_hal, const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const 
double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale); +void amoeba_gpu_clear(); + +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr); + +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr); + +void amoeba_gpu_update_fieldp(void **fieldp_ptr); + +void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int 
nxlo_out, const int nxhi_out); + +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); + +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi, + const double felec); + +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tq_ptr); + +double amoeba_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tq_pinned = nullptr; + + gpu_hal_ready = false; // true for AMOEBA when ready + gpu_repulsion_ready = false; // always false for AMOEBA + gpu_dispersion_real_ready = false; // always false for AMOEBA + gpu_multipole_real_ready = true; // need to be true for precompute() + gpu_udirect2b_ready = true; + gpu_umutual1_ready = true; + gpu_fphi_uind_ready = true; + gpu_umutual2b_ready = true; + gpu_polar_real_ready = true; // need to be true for copying data from device back to host + + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairAmoebaGPU::~PairAmoebaGPU() +{ + amoeba_gpu_clear(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::init_style() +{ + PairAmoeba::init_style(); + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + 
double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int mnf = 5e-2 * neighbor->oneatom; + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + pdamp, thole, dirdamp, amtype2class, special_hal, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, csix, adisp, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + maxspecial15, cell_size, gpu_mode, screen, + polar_dscale, polar_uscale); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + // replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + } +} + +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh; + + double 
sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + + amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } else { + auto *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } +} + +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to call reverse_comm() for crstyle = FIELD; 
+------------------------------------------------------------------------- */ + +void PairAmoebaGPU::induce() +{ + bool done; + int i,j,m,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // owned atoms + + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // need reverse_comm if dfield0c (i.e. udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } + + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + // NOTE: this is for ic_kspace, and thetai[1-3] + + if (ic_kspace) + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
+ // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < 
nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += conjp[i][j]*vecp[i][j]; + sum += 
rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; j++) { + uind[i][j] += 
term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (comm->me == 0) + error->warning(FLERR,"AMOEBA induced dipoles did not converge"); + } + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] = upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b(double **field, double **fieldp) +{ + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + + int inum; + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + amoeba_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom 
at a time in real-space double loop over atoms & neighs + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). + // udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + if (acc_float) { + auto field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } + + +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b_cpu() +{ + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti; + double pgamma; + double damp,expdamp; + double scale3,scale5; + double scalek; + double bn[4],bcn[3]; + double factor_uscale; + + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + + 
// neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? + + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - 
(1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } else { + if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::ufield0c(double **field, double **fieldp) +{ + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + + // get the real space portion of the mutual field first + + double time0, time1, time2; + + MPI_Barrier(world); + time0 = platform::walltime(); + + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = platform::walltime(); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + time2 = platform::walltime(); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } + + // accumulate the field and 
fieldp values from the real-space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + amoeba_gpu_update_fieldp(&fieldp_pinned); + + int inum = atom->nlocal; + if (acc_float) { + auto field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } + + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; +} + +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual1(double **field, double **fieldp) +{ + int m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (int j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + for (int i = 0; i 
< nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } + + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); + + // map 2 values to grid + + double time0, time1; + MPI_Barrier(world); + time0 = platform::walltime(); + + grid_uind(fuind,fuinp,gridpre); + + time1 = platform::walltime(); + time_grid_uind += (time1 - time0); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); + + // get potential + + MPI_Barrier(world); + time0 = platform::walltime(); + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + time1 = platform::walltime(); + time_fphi_uind += (time1 - 
time0); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } + +} + +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + amoeba_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += 
nlocal; + } + } + + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + } + +} + +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + 
amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); +} + +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff and aewald for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + + amoeba_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } else { + auto *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } +} + +/* ---------------------------------------------------------------------- + polar_kspace = KSpace portion of induced dipole polarization + adapted from Tinker eprecip1() routine + same as PairAmoeba, except that fphi_uind() is reimplemented here + ------------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_kspace() +{ + int i,j,k,m,n; + int nhalf1,nhalf2,nhalf3; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + int 
j1,j2,j3; + int ix,iy,iz; + double eterm,felec; + double r1,r2,r3; + double h1,h2,h3; + double f1,f2,f3; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double volterm,denom; + double hsq,expterm; + double term,pterm; + double vterm,struc2; + double tep[3]; + double fix[3],fiy[3],fiz[3]; + double cphid[4],cphip[4]; + double a[3][3]; // indices not flipped vs Fortran + + bool gpu_fphi_mpole_ready = true; + + // indices into the electrostatic field array + // decremented by 1 versus Fortran + + int deriv1[10] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; + int deriv2[10] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; + int deriv3[10] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // owned atoms + + double **x = atom->x; + double **f = atom->f; + int nlocal = atom->nlocal; + + double volbox = domain->prd[0] * domain->prd[1] * domain->prd[2]; + pterm = pow((MY_PI/aewald),2.0); + volterm = MY_PI * volbox; + + // initialize variables required for the scalar summation + + felec = electric / am_dielectric; + + // remove scalar sum virial from prior multipole FFT + // can only do this if multipoles were computed with same aeewald = apewald + // else need to re-compute it via new long-range solve + + nfft1 = p_kspace->nx; + nfft2 = p_kspace->ny; + nfft3 = p_kspace->nz; + bsorder = p_kspace->order; + + nhalf1 = (nfft1+1) / 2; + nhalf2 = (nfft2+1) / 2; + nhalf3 = (nfft3+1) / 2; + + nxlo = p_kspace->nxlo_fft; + nxhi = p_kspace->nxhi_fft; + nylo = p_kspace->nylo_fft; + nyhi = p_kspace->nyhi_fft; + nzlo = p_kspace->nzlo_fft; + nzhi = p_kspace->nzhi_fft; + + // use previous results or compute new qfac and convolution + + if (aewald == aeewald) { + vxx = -vmsave[0]; + vyy = -vmsave[1]; + vzz = -vmsave[2]; + vxy = -vmsave[3]; + vxz = -vmsave[4]; + vyz = -vmsave[5]; + + } else { + + // setup stencil size and B-spline coefficients + + moduli(); + 
bspline_fill(); + + // allocate memory and make early host-device transfers + + // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill + if (gpu_fphi_mpole_ready) { + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, + thetai1, thetai2, thetai3, igrid, + p_kspace->nzlo_out, p_kspace->nzhi_out, + p_kspace->nylo_out, p_kspace->nyhi_out, + p_kspace->nxlo_out, p_kspace->nxhi_out); + } + + + // convert Cartesian multipoles to fractional coordinates + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // zero virial accumulation variables + + vxx = vyy = vzz = vxy = vxz = vyz = 0.0; + + // perform convolution on K-space points I own + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? 
k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + if (hsq) expterm = exp(term) / denom; + struc2 = gridfft[n]*gridfft[n] + gridfft[n+1]*gridfft[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx -= h1*h1*vterm - eterm; + vyy -= h2*h2*vterm - eterm; + vzz -= h3*h3*vterm - eterm; + vxy -= h1*h2*vterm; + vxz -= h1*h3*vterm; + vyz -= h2*h3*vterm; + } + + expterm = qfac[m++]; + gridfft[n] *= expterm; + gridfft[n+1] *= expterm; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 3d grid in brick decomp w/ ghost values + + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); + + // get potential + + if (!gpu_fphi_mpole_ready) { + fphi_mpole(gridpost,fphi); + + for (i = 0; i < nlocal; i++) { + for (k = 0; k < 20; k++) + fphi[i][k] *= felec; + } + + } else { + void* fphi_pinned = nullptr; + amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); + if (acc_float) { + auto _fphi_ptr = (float *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + } else { + auto _fphi_ptr = (double *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + } + } + + // convert field from fractional to Cartesian + + fphi_to_cphi(fphi,cphi); + } + + // convert Cartesian induced dipoles to fractional coordinates + + for (i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (i = 0; i < nlocal; i++) { + 
for (j = 0; j < 3; j++) { + fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; + fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; + } + } + + // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); + + // map 2 values to grid + + grid_uind(fuind,fuinp,gridpre2); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // use qfac values from above or from induce() + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); + + // get potential + + fphi_uind(gridpost,fphid,fphip,fphidp); + + // TODO: port the remaining loops to the GPU + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fphid[i][j]; + fphip[i][j] = felec * fphip[i][j]; + } + for (j = 0; j < 20; j++) + fphidp[i][j] = felec * fphidp[i][j]; + } + + // increment the dipole polarization gradient contributions + + for (i = 0; i < nlocal; i++) { + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + for (k = 0; k < 3; k++) { + j1 = deriv1[k+1]; + j2 = deriv2[k+1]; + j3 = deriv3[k+1]; + f1 += (fuind[i][k]+fuinp[i][k])*fphi[i][j1]; + f2 += (fuind[i][k]+fuinp[i][k])*fphi[i][j2]; + f3 += (fuind[i][k]+fuinp[i][k])*fphi[i][j3]; + if (poltyp == MUTUAL) { + f1 += fuind[i][k]*fphip[i][j1] + fuinp[i][k]*fphid[i][j1]; + f2 += fuind[i][k]*fphip[i][j2] + fuinp[i][k]*fphid[i][j2]; + f3 += fuind[i][k]*fphip[i][j3] + 
fuinp[i][k]*fphid[i][j3]; + } + } + for (k = 0; k < 10; k++) { + f1 += fmp[i][k]*fphidp[i][deriv1[k]]; + f2 += fmp[i][k]*fphidp[i][deriv2[k]]; + f3 += fmp[i][k]*fphidp[i][deriv3[k]]; + } + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + f[i][0] -= h1; + f[i][1] -= h2; + f[i][2] -= h3; + } + + // set the potential to be the induced dipole average + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + fphidp[i][j] *= 0.5; + } + + fphi_to_cphi(fphidp,cphidp); + + // get the fractional to Cartesian transformation matrix + + //frac_to_cart(); + + // increment the dipole polarization virial contributions + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (k = 1; k < 4; k++) { + cphid[j] += ftc[j][k]*fphid[i][k]; + cphip[j] += ftc[j][k]*fphip[i][k]; + } + } + + vxx -= cmp[i][1]*cphidp[i][1] + + 0.5*((uind[i][0]+uinp[i][0])*cphi[i][1]); + vyy -= cmp[i][2]*cphidp[i][2] + + 0.5*((uind[i][1]+uinp[i][1])*cphi[i][2]); + vzz -= cmp[i][3]*cphidp[i][3] + + 0.5*((uind[i][2]+uinp[i][2])*cphi[i][3]); + vxy -= 0.5*(cphidp[i][1]*cmp[i][2]+cphidp[i][2]*cmp[i][1]) + + 0.25*((uind[i][1]+uinp[i][1])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][2]); + vyz -= 0.5*(cphidp[i][2]*cmp[i][3]+cphidp[i][3]*cmp[i][2]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][2] + + (uind[i][1]+uinp[i][1])*cphi[i][3]); + vxz -= 0.5*(cphidp[i][1]*cmp[i][3]+cphidp[i][3]*cmp[i][1]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][3]); + + vxx -= 2.0*cmp[i][4]*cphidp[i][4] + cmp[i][7]*cphidp[i][7] + + cmp[i][8]*cphidp[i][8]; + vyy -= 2.0*cmp[i][5]*cphidp[i][5] + cmp[i][7]*cphidp[i][7] + + cmp[i][9]*cphidp[i][9]; + vzz -= 2.0*cmp[i][6]*cphidp[i][6] + cmp[i][8]*cphidp[i][8] + + cmp[i][9]*cphidp[i][9]; + vxy -= (cmp[i][4]+cmp[i][5])*cphidp[i][7] 
+ + 0.5*(cmp[i][7]*(cphidp[i][5]+cphidp[i][4]) + + cmp[i][8]*cphidp[i][9]+cmp[i][9]*cphidp[i][8]); + vyz -= (cmp[i][5]+cmp[i][6])*cphidp[i][9] + + 0.5*(cmp[i][9]*(cphidp[i][5]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][8]+cmp[i][8]*cphidp[i][7]); + vxz -= (cmp[i][4]+cmp[i][6])*cphidp[i][8] + + 0.5*(cmp[i][8]*(cphidp[i][4]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][9]+cmp[i][9]*cphidp[i][7]); + + if (poltyp == MUTUAL) { + vxx -= 0.5 * (cphid[1]*uinp[i][0]+cphip[1]*uind[i][0]); + vyy -= 0.5 * (cphid[2]*uinp[i][1]+cphip[2]*uind[i][1]); + vzz -= 0.5 * (cphid[3]*uinp[i][2]+cphip[3]*uind[i][2]); + vxy -= 0.25 * (cphid[1]*uinp[i][1]+cphip[1]*uind[i][1] + + cphid[2]*uinp[i][0]+cphip[2]*uind[i][0]); + vyz -= 0.25 * (cphid[2]*uinp[i][2]+cphip[2]*uind[i][2] + + cphid[3]*uinp[i][1]+cphip[3]*uind[i][1]); + vxz -= 0.25 * (cphid[1]*uinp[i][2]+cphip[1]*uind[i][2] + + cphid[3]*uinp[i][0]+cphip[3]*uind[i][0]); + } + } + + + // resolve site torques then increment forces and virial + + for (i = 0; i < nlocal; i++) { + tep[0] = cmp[i][3]*cphidp[i][2] - cmp[i][2]*cphidp[i][3] + + 2.0*(cmp[i][6]-cmp[i][5])*cphidp[i][9] + cmp[i][8]*cphidp[i][7] + + cmp[i][9]*cphidp[i][5]- cmp[i][7]*cphidp[i][8] - cmp[i][9]*cphidp[i][6]; + tep[1] = cmp[i][1]*cphidp[i][3] - cmp[i][3]*cphidp[i][1] + + 2.0*(cmp[i][4]-cmp[i][6])*cphidp[i][8] + cmp[i][7]*cphidp[i][9] + + cmp[i][8]*cphidp[i][6] - cmp[i][8]*cphidp[i][4] - cmp[i][9]*cphidp[i][7]; + tep[2] = cmp[i][2]*cphidp[i][1] - cmp[i][1]*cphidp[i][2] + + 2.0*(cmp[i][5]-cmp[i][4])*cphidp[i][7] + cmp[i][7]*cphidp[i][4] + + cmp[i][9]*cphidp[i][8] - cmp[i][7]*cphidp[i][5] - cmp[i][8]*cphidp[i][9]; + + torque2force(i,tep,fix,fiy,fiz,f); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx += 
xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy += yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz += zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy += 0.5*(yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vyz += 0.5*(zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + vxz += 0.5*(zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + } + + // account for dipole response terms in the OPT method + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (k = 0; k < optorder; k++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fopt[i][k][j]; + fphip[i][j] = felec * foptp[i][k][j]; + } + + for (m = 0; m < optorder-k; m++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[0][j]*uopt[i][m][0] + a[1][j]*uopt[i][m][1] + + a[2][j]*uopt[i][m][2]; + fuinp[i][j] = a[0][j]*uoptp[i][m][0] + a[1][j]*uoptp[i][m][1] + + a[2][j]*uoptp[i][m][2]; + } + + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + + for (j = 0; j < 3; j++) { + j1 = deriv1[j+1]; + j2 = deriv2[j+1]; + j3 = deriv3[j+1]; + f1 += fuind[i][j]*fphip[i][j1] + fuinp[i][j]*fphid[i][j1]; + f2 += fuind[i][j]*fphip[i][j2] + fuinp[i][j]*fphid[i][j2]; + f3 += fuind[i][j]*fphip[i][j3] + fuinp[i][j]*fphid[i][j3]; + } + + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + + f[i][0] -= copm[k+m+1]*h1; + f[i][1] -= copm[k+m+1]*h2; + f[i][2] -= copm[k+m+1]*h3; + + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (j1 = 1; j1 < 4; j1++) { + cphid[j] += ftc[j][j1]*fphid[i][j1]; + cphip[j] += ftc[j][j1]*fphip[i][j1]; + } + } + + vxx -= 0.5*copm[k+m+1] * + (cphid[1]*uoptp[i][m][0] + cphip[1]*uopt[i][m][0]); + vyy -= 0.5*copm[k+m+1] * + (cphid[2]*uoptp[i][m][1]+ cphip[2]*uopt[i][m][1]); + vzz -= 0.5*copm[k+m+1] * + (cphid[3]*uoptp[i][m][2]+ cphip[3]*uopt[i][m][2]); 
+ vxy -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][1]+ cphip[1]*uopt[i][m][1]+ + cphid[2]*uoptp[i][m][0]+ cphip[2]*uopt[i][m][0]); + vyz -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][2]+ cphip[1]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][0]+ cphip[3]*uopt[i][m][0]); + vxz -= 0.25*copm[k+m+1] * + (cphid[2]*uoptp[i][m][2]+ cphip[2]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][1]+ cphip[3]*uopt[i][m][1]); + } + } + } + } + + // assign permanent and induced multipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] += uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); + + // assign induced dipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] += uind[i][j-1] - uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors + + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; 
i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + + // assign only the induced dipoles to the PME grid + // and perform the 3-D FFT forward transformation + // NOTE: why is there no inverse FFT in this section? + + if (poltyp == DIRECT || poltyp == TCG) { + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + cmp[i][j] = 0.0; + for (j = 1; j < 4; j++) + cmp[i][j] = uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); + + // assign ??? 
to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] = uind[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + } + + // increment the total internal virial tensor components + + if (vflag_global) { + virpolar[0] -= vxx; + virpolar[1] -= vyy; + virpolar[2] -= vzz; + virpolar[3] -= vxy; + virpolar[4] -= vxz; + virpolar[5] -= vyz; + } +} + +/* ---------------------------------------------------------------------- + compute atom forces from torques +------------------------------------------------------------------------- */ + +template <class numtyp>
+void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],_tq[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairAmoebaGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + amoeba_gpu_bytes(); +} diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h new file mode 100644 index 0000000000..be53f7ef50 --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.h @@ -0,0 +1,72 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS 
Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(amoeba/gpu,PairAmoebaGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_AMOEBA_GPU_H +#define LMP_PAIR_AMOEBA_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairAmoebaGPU : public PairAmoeba { + public: + PairAmoebaGPU(LAMMPS *lmp); + ~PairAmoebaGPU() override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void induce() override; + + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; + void polar_kspace() override; + + private: + int gpu_mode; + double cpu_time; + void *tq_pinned; + void *fieldp_pinned; + bool acc_float; + + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; + bool gpu_multipole_real_ready; + bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; + + void udirect2b_cpu(); + + template <class numtyp> + void compute_force_from_torque(const numtyp*, double**, double*); +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp new file mode 100644 index 0000000000..9d286d5db7 --- /dev/null +++ b/src/GPU/pair_hippo_gpu.cpp @@ -0,0 +1,1494 @@ +// clang-format off 
+/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern/UChicago) +------------------------------------------------------------------------- */ + +#include "pair_hippo_gpu.h" + +#include "amoeba_convolution_gpu.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "fix_store_peratom.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "math_const.h" +#include "memory.h" +#include "my_page.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" +#include <cmath> + +using namespace LAMMPS_NS; +using namespace MathConst; + +enum{INDUCE,RSD,SETUP_hippo,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; +enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +// same as in pair_amoeba.cpp +enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye + +// External functions from cuda library for atom decomposition + +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const 
double *host_thole, + const double *host_dirdamp, const int* host_amtype2class, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale); +void hippo_gpu_clear(); + +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr); + +void 
hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, + const double aewald, const double off2); + +void hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *host_pval, const double aewald, const double off2, void **fieldp_ptr); + +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr); + +void hippo_gpu_update_fieldp(void **fieldp_ptr); + +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); + +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tq_ptr); + +double hippo_gpu_bytes(); + +/* 
---------------------------------------------------------------------- */ + +PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + amoeba = false; + mystyle = "hippo"; + + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tq_pinned = nullptr; + + gpu_hal_ready = false; // always false for HIPPO + gpu_repulsion_ready = true; + gpu_dispersion_real_ready = true; + gpu_multipole_real_ready = true; + gpu_udirect2b_ready = true; + gpu_umutual1_ready = true; + gpu_fphi_uind_ready = true; + gpu_umutual2b_ready = true; + gpu_polar_real_ready = true; // need to be true for copying data from device back to host + + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairHippoGPU::~PairHippoGPU() +{ + hippo_gpu_clear(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairHippoGPU::init_style() +{ + PairAmoeba::init_style(); + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int mnf = 5e-2 * neighbor->oneatom; + int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + 
pdamp, thole, dirdamp, amtype2class, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, sizpr, dmppr, elepr, + csix, adisp, pcore, palpha, + atom->nlocal, atom->nlocal+atom->nghost, mnf, + maxspecial, maxspecial15, cell_size, gpu_mode, + screen, polar_dscale, polar_uscale); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + // replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + } +} + +/* ---------------------------------------------------------------------- + repulsion = Pauli repulsion interactions + adapted from Tinker erepel1b() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::repulsion() +{ + if (!gpu_repulsion_ready) { + PairAmoeba::repulsion(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, 
domain->boxlo, domain->prd); + + // select the correct cutoff for the term + + choose(REPULSE); + + hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, cut2, + c0, c1, c2, c3, c4, c5, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse + } else { + auto *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse + } +} + +/* ---------------------------------------------------------------------- + dispersion_real = real-space portion of Ewald dispersion + adapted from Tinker edreal1d() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::dispersion_real() +{ + if (!gpu_dispersion_real_ready) { + PairAmoeba::dispersion_real(); + return; + } + + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff for the term + + if (use_dewald) choose(DISP_LONG); + else choose(DISP); + + hippo_gpu_compute_dispersion_real(amtype, amgroup, rpole, aewald, off2); +} + +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine 
+------------------------------------------------------------------------- */ + +void PairHippoGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + double *pval = atom->dvector[index_pval]; + + hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, pval, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } else { + auto *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } +} + +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to 
call reverse_comm() for crstyle = FIELD; +------------------------------------------------------------------------- */ + +void PairHippoGPU::induce() +{ + bool done; + int i,j,m,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // owned atoms + + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // need reverse_comm if dfield0c (i.e. udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } + + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + if (ic_kspace) + hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
+ // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < 
nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + //error->all(FLERR,"STOP"); + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += 
conjp[i][j]*vecp[i][j]; + sum += rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; 
j++) { + uind[i][j] += term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (comm->me == 0) + error->warning(FLERR,"HIPPO induced dipoles did not converge"); + } + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] = upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b(double **field, double **fieldp) +{ + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + + int inum; + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + double *pval = atom->dvector[index_pval]; + hippo_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); + + // rebuild 
dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). + // udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + if (acc_float) { + auto field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + + } else { + + auto field_ptr = (double *)fieldp_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b_cpu() +{ + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti; + double pgamma; + double damp,expdamp; + double scale3,scale5,scalek; + double bn[4],bcn[3]; + double factor_uscale; + + int inum,jnum; + int 
*ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + + // neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? + + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - 
expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } else { + + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairHippoGPU::ufield0c(double **field, double **fieldp) +{ + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + + // get the real space portion of the mutual field first + + double time0, time1, time2; + MPI_Barrier(world); + time0 = platform::walltime(); + + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = platform::walltime(); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + time2 = platform::walltime(); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } + + // accumulate the field and 
fieldp values from the real-space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + hippo_gpu_update_fieldp(&fieldp_pinned); + int inum = atom->nlocal; + + if (acc_float) { + auto *field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + + } else { + auto *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; +} + +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual1(double **field, double **fieldp) +{ + int m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (int j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + + for (int i = 0; 
i < nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } + + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); + + // map 2 values to grid + + double time0, time1; + MPI_Barrier(world); + time0 = platform::walltime(); + + grid_uind(fuind,fuinp,gridpre); + + time1 = platform::walltime(); + time_grid_uind += (time1 - time0); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); + + // get potential + + MPI_Barrier(world); + time0 = platform::walltime(); + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + time1 = platform::walltime(); + time_fphi_uind += (time1 - 
time0); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + // convert the dipole fields from fractional to Cartesian + + for (int i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } +} + +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairHippoGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + hippo_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); + + int nlocal = atom->nlocal; + 
if (acc_float) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + } +} + +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + 
domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + double *pval = atom->dvector[index_pval]; + hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); +} + +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff and aewald for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + double *pval = atom->dvector[index_pval]; + + hippo_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, pval, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } else { + auto *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } +} + +/* ---------------------------------------------------------------------- + compute atom forces from torques used by various terms 
+------------------------------------------------------------------------- */ + +template +void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],_tq[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairHippoGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + hippo_gpu_bytes(); +} diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h new file mode 100644 index 0000000000..d160446d77 --- /dev/null +++ b/src/GPU/pair_hippo_gpu.h @@ -0,0 +1,73 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively 
Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(hippo/gpu,PairHippoGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_HIPPO_GPU_H +#define LMP_PAIR_HIPPO_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairHippoGPU : public PairAmoeba { + public: + PairHippoGPU(LAMMPS *lmp); + ~PairHippoGPU() override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void induce() override; + + void repulsion() override; + void dispersion_real() override; + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; + + private: + int gpu_mode; + double cpu_time; + void *tq_pinned; + void *fieldp_pinned; + bool acc_float; + + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; + bool gpu_multipole_real_ready; + bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; + + void udirect2b_cpu(); + + template + void compute_force_from_torque(const numtyp*, double**, double*); +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index 
65a2e6d8ce..ce04be2cc8 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -21,6 +21,7 @@ #include "atom.h" #include "atom_vec.h" +#include "citeme.h" #include "comm.h" #include "domain.h" #include "error.h" @@ -36,12 +37,25 @@ #include "pair_reaxff.h" #include "reaxff_defs.h" +#include #include #include +#include using namespace LAMMPS_NS; using namespace FixConst; +static const char cite_reaxff_species_delete[] = + "fix reaxff/species, 'delete' keyword: https://doi.org/10.1016/j.carbon.2022.11.002\n\n" + "@Article{Gissinger23,\n" + " author = {J. R. Gissinger, S. R. Zavada, J. G. Smith, J. Kemppainen, I. Gallegos, G. M. Odegard, E. J. Siochi, K. E. Wise},\n" + " title = {Predicting char yield of high-temperature resins},\n" + " journal = {Carbon},\n" + " year = 2023,\n" + " volume = 202,\n" + " pages = {336-347}\n" + "}\n\n"; + /* ---------------------------------------------------------------------- */ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : @@ -145,6 +159,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : ele = filepos = filedel = nullptr; eleflag = posflag = padflag = 0; delflag = specieslistflag = masslimitflag = 0; + delete_Nlimit = delete_Nsteps = 0; singlepos_opened = multipos_opened = del_opened = 0; multipos = 0; @@ -221,7 +236,12 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : } else error->all(FLERR, "Unknown fix reaxff/species delete option: {}", arg[iarg]); - + // rate limit when deleting molecules + } else if (strcmp(arg[iarg], "delete_rate_limit") == 0) { + if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species delete_rate_limit", error); + delete_Nlimit = utils::numeric(FLERR, arg[iarg+1], false, lmp); + delete_Nsteps = utils::numeric(FLERR, arg[iarg+2], false, lmp); + iarg += 3; // position of molecules } else if (strcmp(arg[iarg], "position") == 0) { if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix 
reaxff/species position", error); @@ -260,6 +280,15 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : if (delflag && specieslistflag && masslimitflag) error->all(FLERR, "Incompatible combination fix reaxff/species command options"); + if (delete_Nlimit > 0) { + if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete); + memory->create(delete_Tcount,delete_Nsteps,"reaxff/species:delete_Tcount"); + + for (int i = 0; i < delete_Nsteps; i++) + delete_Tcount[i] = -1; + delete_Tcount[0] = 0; + } + vector_nmole = 0; vector_nspec = 0; } @@ -279,6 +308,7 @@ FixReaxFFSpecies::~FixReaxFFSpecies() memory->destroy(Mol2Spec); memory->destroy(MolType); memory->destroy(MolName); + memory->destroy(delete_Tcount); delete[] filepos; delete[] filedel; @@ -375,7 +405,13 @@ void FixReaxFFSpecies::Output_ReaxFF_Bonds(bigint ntimestep, FILE * /*fp*/) // point to fix_ave_atom f_SPECBOND->end_of_step(); - if (ntimestep != nvalid) return; + if (ntimestep != nvalid) { + // push back delete_Tcount on every step + if (delete_Nlimit > 0) + for (int i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; + return; + } nlocal = atom->nlocal; @@ -826,6 +862,15 @@ void FixReaxFFSpecies::WritePos(int Nmole, int Nspec) void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) { + int ndeletions; + int headroom = -1; + if (delete_Nlimit > 0) { + if (delete_Tcount[delete_Nsteps-1] == -1) return; + ndeletions = delete_Tcount[0] - delete_Tcount[delete_Nsteps-1]; + headroom = MAX(0, delete_Nlimit - ndeletions); + if (headroom == 0) return; + } + int i, j, m, n, itype, cid; int ndel, ndelone, count, count_tmp; int *Nameall; @@ -856,7 +901,23 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) int *marklist; memory->create(marklist, nlocal, "reaxff/species:marklist"); - for (m = 1; m <= Nmole; m++) { + std::random_device rnd; + std::minstd_rand park_rng(rnd()); + int *molrange; + memory->create(molrange,Nmole,"reaxff/species:molrange"); + for (m = 0; 
m < Nmole; m++) + molrange[m] = m + 1; + if (delete_Nlimit > 0) { + // shuffle index when using rate_limit, in case order is biased + if (comm->me == 0) + std::shuffle(&molrange[0],&molrange[Nmole], park_rng); + MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world); + } + + int this_delete_Tcount = 0; + for (int mm = 0; mm < Nmole; mm++) { + if (this_delete_Tcount == headroom) break; + m = molrange[mm]; localmass = totalmass = count = nmarklist = 0; for (n = 0; n < ntypes; n++) Name[n] = 0; @@ -896,6 +957,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) // find corresponding moltype if (totalmass > massmin && totalmass < massmax) { + this_delete_Tcount++; for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[Mol2Spec[m - 1]] += 1.0 / (double) count; @@ -905,6 +967,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) if (count > 0) { for (i = 0; i < ndelspec; i++) { if (del_species[i] == species_str) { + this_delete_Tcount++; for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[i] += 1.0 / (double) count; @@ -976,6 +1039,14 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) } } + + // push back delete_Tcount on every step + if (delete_Nlimit > 0) { + for (i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; + delete_Tcount[0] += this_delete_Tcount; + } + if (ndel && (atom->map_style != Atom::MAP_NONE)) { atom->nghost = 0; atom->map_init(); @@ -988,6 +1059,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) memory->destroy(marklist); memory->destroy(mark); memory->destroy(deletecount); + memory->destroy(molrange); } /* ---------------------------------------------------------------------- */ diff --git a/src/REAXFF/fix_reaxff_species.h b/src/REAXFF/fix_reaxff_species.h index 65eeae4c60..329e17145b 100644 --- a/src/REAXFF/fix_reaxff_species.h +++ b/src/REAXFF/fix_reaxff_species.h @@ -60,6 +60,7 @@ class FixReaxFFSpecies : public Fix { FILE *fp, *pos, *fdel; int eleflag, 
posflag, multipos, padflag, setupflag; int delflag, specieslistflag, masslimitflag; + int delete_Nlimit, delete_Nsteps, *delete_Tcount; double massmin, massmax; int singlepos_opened, multipos_opened, del_opened; char *ele, **eletype, *filepos, *filedel; diff --git a/src/atom.cpp b/src/atom.cpp index 480a779e68..32285758c0 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2345,6 +2345,18 @@ void Atom::setup_sort_bins() return; } +#ifdef LMP_GPU + if (userbinsize == 0.0) { + auto ifix = dynamic_cast(modify->get_fix_by_id("package_gpu")); + if (ifix) { + const double subx = domain->subhi[0] - domain->sublo[0]; + const double suby = domain->subhi[1] - domain->sublo[1]; + const double subz = domain->subhi[2] - domain->sublo[2]; + binsize = ifix->binsize(subx, suby, subz, atom->nlocal, 0.5 * neighbor->cutneighmax); + } + } +#endif + double bininv = 1.0/binsize; // nbin xyz = local bins diff --git a/src/neighbor.cpp b/src/neighbor.cpp index f2b094ec37..05371c8259 100644 --- a/src/neighbor.cpp +++ b/src/neighbor.cpp @@ -535,6 +535,7 @@ void Neighbor::init() int flag=0; for (int isub=0; isub < ph->nstyles; ++isub) { if (force->pair_match("amoeba",0,isub) + || force->pair_match("hippo",0,isub) || force->pair_match("coul/wolf",0,isub) || force->pair_match("coul/dsf",0,isub) || force->pair_match("coul/exclude",0) @@ -545,6 +546,7 @@ void Neighbor::init() special_flag[1] = special_flag[2] = special_flag[3] = 2; } else { if (force->pair_match("amoeba",0) + || force->pair_match("hippo",0) || force->pair_match("coul/wolf",0) || force->pair_match("coul/dsf",0) || force->pair_match("coul/exclude",0)