diff --git a/cmake/Modules/LAMMPSUtils.cmake b/cmake/Modules/LAMMPSUtils.cmake index d42f91f10e..9b42dafc44 100644 --- a/cmake/Modules/LAMMPSUtils.cmake +++ b/cmake/Modules/LAMMPSUtils.cmake @@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir) endfunction() macro(pkg_depends PKG1 PKG2) - if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2})) - message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package") + if(DEFINED BUILD_${PKG2}) + if(PKG_${PKG1} AND NOT BUILD_${PKG2}) + message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON") + endif() + elseif(DEFINED PKG_${PKG2}) + if(PKG_${PKG1} AND NOT PKG_${PKG2}) + message(WARNING "The ${PKG1} package depends on the ${PKG2} package. Enabling it.") + set(PKG_${PKG2} ON CACHE BOOL "" FORCE) + endif() endif() endmacro() diff --git a/cmake/Modules/Packages/COMPRESS.cmake b/cmake/Modules/Packages/COMPRESS.cmake index bdcf1aa3f8..4e1ab846a7 100644 --- a/cmake/Modules/Packages/COMPRESS.cmake +++ b/cmake/Modules/Packages/COMPRESS.cmake @@ -1,4 +1,9 @@ -find_package(ZLIB REQUIRED) +find_package(ZLIB) +if(NOT ZLIB_FOUND) + message(WARNING "No Zlib development support found. 
Disabling COMPRESS package...") + set(PKG_COMPRESS OFF CACHE BOOL "" FORCE) + return() +endif() target_link_libraries(lammps PRIVATE ZLIB::ZLIB) find_package(PkgConfig QUIET) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index dd66276ae4..24d9538206 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -26,6 +26,19 @@ elseif(GPU_PREC STREQUAL "SINGLE") set(GPU_PREC_SETTING "SINGLE_SINGLE") endif() +option(GPU_DEBUG "Enable debugging code of the GPU package" OFF) +mark_as_advanced(GPU_DEBUG) + +if(PKG_AMOEBA AND FFT_SINGLE) + message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT") +endif() + +if (PKG_AMOEBA) + list(APPEND GPU_SOURCES + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp) +endif() + file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu) @@ -151,7 +164,12 @@ if(GPU_API STREQUAL "CUDA") add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS}) target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS}) - target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS}) + target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() if(CUDPP_OPT) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini) target_compile_definitions(gpu PRIVATE -DUSE_CUDPP) @@ -192,6 +210,7 @@ elseif(GPU_API STREQUAL "OPENCL") ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu 
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu + ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu ) foreach(GPU_KERNEL ${GPU_LIB_CU}) @@ -208,6 +227,7 @@ elseif(GPU_API STREQUAL "OPENCL") GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu) GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu) GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu) + GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu) list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h @@ -217,14 +237,18 @@ elseif(GPU_API STREQUAL "OPENCL") ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h + ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ) add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu PRIVATE OpenCL::OpenCL) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) - + target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) + endif() target_link_libraries(lammps PRIVATE gpu) add_executable(ocl_get_devices 
${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp) @@ -374,8 +398,12 @@ elseif(GPU_API STREQUAL "HIP") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_HIP) + target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() target_link_libraries(gpu PRIVATE hip::host) if(HIP_USE_DEVICE_SORT) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index f2cfa078c2..de64df7268 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -144,6 +144,7 @@ if(PKG_ML-IAP) ${KOKKOS_PKG_SOURCES_DIR}/mliap_descriptor_so3_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/mliap_model_linear_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/mliap_model_python_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/mliap_unified_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/mliap_so3_kokkos.cpp) # Add KOKKOS version of ML-IAP Python coupling if non-KOKKOS version is included diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index be2ba0fc60..659d185e18 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -126,10 +126,11 @@ CMake build -D GPU_API=value # value = opencl (default) or cuda or hip -D GPU_PREC=value # precision setting # value = double or mixed (default) or single - -D HIP_PATH # path to HIP installation. 
Must be set if GPU_API=HIP -D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda - # value = sm_XX, see below - # default is sm_50 + # value = sm_XX (see below, default is sm_50) + -D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers + # value = yes or no (default) + -D HIP_PATH=value # value = path to HIP installation. Must be set if GPU_API=HIP -D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip # value depends on selected HIP_PLATFORM # default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index f5924f12c7..7f7b2d4b7d 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -39,7 +39,7 @@ OPT. * :doc:`agni (o) ` * :doc:`airebo (io) ` * :doc:`airebo/morse (io) ` - * :doc:`amoeba ` + * :doc:`amoeba (g) ` * :doc:`atm ` * :doc:`awpmd/cut ` * :doc:`beck (go) ` @@ -126,7 +126,7 @@ OPT. * :doc:`hbond/dreiding/lj (o) ` * :doc:`hbond/dreiding/morse (o) ` * :doc:`hdnnp ` - * :doc:`hippo ` + * :doc:`hippo (g) ` * :doc:`ilp/graphene/hbn (t) ` * :doc:`ilp/tmd (t) ` * :doc:`kolmogorov/crespi/full ` @@ -200,6 +200,7 @@ OPT. * :doc:`mdpd ` * :doc:`mdpd/rhosum ` * :doc:`meam (k) ` + * :doc:`meam/ms (k) ` * :doc:`meam/spline (o) ` * :doc:`meam/sw/spline ` * :doc:`mesocnt ` diff --git a/doc/src/fix_pimd.rst b/doc/src/fix_pimd.rst index 838b9812ad..e5d42eb15f 100644 --- a/doc/src/fix_pimd.rst +++ b/doc/src/fix_pimd.rst @@ -149,6 +149,34 @@ related tasks for each of the partitions, e.g. restart 1000 system_${ibead}.restart1 system_${ibead}.restart2 read_restart system_${ibead}.restart2 +Restart, fix_modify, output, run start/stop, minimize info +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +This fix writes the state of the Nose/Hoover thermostat over all +quasi-beads to :doc:`binary restart files `. 
See the +:doc:`read_restart ` command for info on how to re-specify +a fix in an input script that reads a restart file, so that the +operation of the fix continues in an uninterrupted fashion. + +None of the :doc:`fix_modify ` options +are relevant to this fix. + +This fix computes a global 3-vector, which can be accessed by various +:doc:`output commands `. The three quantities in the +global vector are + +#. the total spring energy of the quasi-beads, +#. the current temperature of the classical system of ring polymers, +#. the current value of the scalar virial estimator for the kinetic + energy of the quantum system :ref:`(Herman) `. + +The vector values calculated by this fix are "extensive", except for the +temperature, which is "intensive". + +No parameter of this fix can be used with the *start/stop* keywords of +the :doc:`run ` command. This fix is not invoked during +:doc:`energy minimization `. + Restrictions """""""""""" @@ -204,3 +232,8 @@ Path Integrals, McGraw-Hill, New York (1965). **(Calhoun)** A. Calhoun, M. Pavese, G. Voth, Chem Phys Letters, 262, 415 (1996). + +.. _Herman: + +**(Herman)** M. F. Herman, E. J. Bruskin, B. J. Berne, J Chem Phys, 76, 5150 (1982). 
+ diff --git a/doc/src/fix_reaxff_species.rst b/doc/src/fix_reaxff_species.rst index c78c05a35e..383b8212f9 100644 --- a/doc/src/fix_reaxff_species.rst +++ b/doc/src/fix_reaxff_species.rst @@ -39,6 +39,9 @@ Syntax *masslimit* value = massmin massmax massmin = minimum molecular weight of species to delete massmax = maximum molecular weight of species to delete + *delete_rate_limit* value = Nlimit Nsteps + Nlimit = maximum number of deletions allowed to occur within interval + Nsteps = the interval (number of timesteps) over which to count deletions Examples """""""" @@ -142,7 +145,13 @@ When using the *masslimit* keyword, each line of the *filedel* file contains the timestep on which deletions occurs, followed by how many of each species are deleted (with quantities preceding chemical formulae). The *specieslist* and *masslimit* keywords cannot both be -used in the same *reaxff/species* fix. +used in the same *reaxff/species* fix. The *delete_rate_limit* +keyword can enforce an upper limit on the overall rate of molecule +deletion. The number of deletion occurrences is limited to Nlimit +within an interval of Nsteps timesteps. When using the +*delete_rate_limit* keyword, no deletions are permitted to occur +within the first Nsteps timesteps of the first run (after reading a +either a data or restart file). ---------- diff --git a/doc/src/fix_rigid.rst b/doc/src/fix_rigid.rst index 9a958e50d1..3a2477f90a 100644 --- a/doc/src/fix_rigid.rst +++ b/doc/src/fix_rigid.rst @@ -732,8 +732,8 @@ choices: * Use one of the 4 NPT or NPH styles for the rigid bodies. Use the *dilate* all option so that it will dilate the positions of the - *non-rigid particles as well. Use :doc:`fix nvt ` (or any - *other thermostat) for the non-rigid particles. + non-rigid particles as well. Use :doc:`fix nvt ` (or any + other thermostat) for the non-rigid particles. * Use :doc:`fix npt ` for the group of non-rigid particles. 
Use the *dilate* all option so that it will dilate the center-of-mass positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index f5c0ea14df..6ef92a6938 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -1,11 +1,18 @@ .. index:: pair_style amoeba +.. index:: pair_style amoeba/gpu .. index:: pair_style hippo +.. index:: pair_style hippo/gpu pair_style amoeba command ========================= +Accelerator Variants: *amoeba/gpu* + pair_style hippo command ======================== + +Accelerator Variants: *hippo/gpu* + Syntax """""" @@ -127,6 +134,10 @@ version discussed in :ref:`(Ponder) `, :ref:`(Ren) implementation of HIPPO in LAMMPS matches the version discussed in :ref:`(Rackers) `. +.. versionadded:: TBD + +Accelerator support via the GPU package is available. + ---------- Only a single pair_coeff command is used with either the *amoeba* and @@ -187,6 +198,19 @@ These pair styles can only be used via the *pair* keyword of the ---------- +.. include:: accel_styles.rst + +.. note:: + + Using the GPU accelerated pair styles 'amoeba/gpu' or 'hippo/gpu' + when compiling the GPU package for OpenCL has a few known issues + when running on integrated GPUs and the calculation may crash. + + The GPU accelerated pair styles are also not (yet) compatible + with single precision FFTs. + +---------- + Restrictions """""""""""" diff --git a/doc/src/pair_meam.rst b/doc/src/pair_meam.rst index 6a3d52c4d5..57c40aa6ee 100644 --- a/doc/src/pair_meam.rst +++ b/doc/src/pair_meam.rst @@ -1,17 +1,26 @@ .. index:: pair_style meam .. index:: pair_style meam/kk +.. index:: pair_style meam/ms +.. index:: pair_style meam/ms/kk pair_style meam command ========================= Accelerator Variants: *meam/kk* +pair_style meam/ms command +========================== + +Accelerator Variants: *meam/ms/kk* + Syntax """""" .. 
code-block:: LAMMPS - pair_style meam + pair_style style + +* style = *meam* or *meam/ms* Examples """""""" @@ -22,6 +31,9 @@ Examples pair_coeff * * ../potentials/library.meam Si ../potentials/si.meam Si pair_coeff * * ../potentials/library.meam Ni Al NULL Ni Al Ni Ni + pair_style meam/ms + pair_coeff * * ../potentials/library.msmeam H Ga ../potentials/HGa.meam H Ga + Description """"""""""" @@ -31,16 +43,23 @@ Description as of November 2010; see description below of the mixture_ref_t parameter -Pair style *meam* computes non-bonded interactions for a variety of materials -using the modified embedded-atom method (MEAM) -:ref:`(Baskes) `. Conceptually, it is an extension to the original -:doc:`EAM method ` which adds angular forces. It is -thus suitable for modeling metals and alloys with fcc, bcc, hcp and -diamond cubic structures, as well as materials with covalent interactions -like silicon and carbon. This *meam* pair style is a translation of the -original Fortran version to C++. It is functionally equivalent but more -efficient and has additional features. The Fortran version of the *meam* -pair style has been removed from LAMMPS after the 12 December 2018 release. +Pair style *meam* computes non-bonded interactions for a variety of +materials using the modified embedded-atom method (MEAM) :ref:`(Baskes) +`. Conceptually, it is an extension to the original :doc:`EAM +method ` which adds angular forces. It is thus suitable for +modeling metals and alloys with fcc, bcc, hcp and diamond cubic +structures, as well as materials with covalent interactions like silicon +and carbon. + +The *meam* pair style is a translation of the original Fortran version +to C++. It is functionally equivalent but more efficient and has +additional features. The Fortran version of the *meam* pair style has +been removed from LAMMPS after the 12 December 2018 release. 
+ +Pair style *meam/ms* uses the multi-state MEAM (MS-MEAM) method +according to :ref:`(Baskes2) `, which is an extension to MEAM. +This pair style is mostly equivalent to *meam* and differs only +where noted in the documentation below. In the MEAM formulation, the total energy E of a system of atoms is given by: @@ -351,6 +370,16 @@ Most published MEAM parameter sets use the default values *attrac* = *repulse* = Setting *repuls* = *attrac* = *delta* corresponds to the form used in several recent published MEAM parameter sets, such as :ref:`(Valone) ` +Then using *meam/ms* pair style the multi-state MEAM (MS-MEAM) method is +activated. This requires 6 extra parameters in the MEAM library file, +resulting in 25 parameters ordered that are ordered like this: + +elt, lat, z, ielement, atwt, alpha, b0, b1, b2, b3, b1m, b2m, b3m, alat, esub, asub, +t0, t1, t2, t3, t1m, t2m, t3m, rozero, ibar + +The 6 extra MS-MEAM parameters are *b1m, b2m, b3m, t1m, t2m, t3m*. +In the LAMMPS ``potentials`` folder, compatible files have an ".msmeam" extension. + ---------- .. include:: accel_styles.rst @@ -393,16 +422,15 @@ This pair style can only be used via the *pair* keyword of the Restrictions """""""""""" -The *meam* style is provided in the MEAM package. It is -only enabled if LAMMPS was built with that package. +The *meam* and *meam/ms* pair styles are provided in the MEAM +package. They are only enabled if LAMMPS was built with that package. See the :doc:`Build package ` page for more info. -The maximum number of elements, that can be read from the MEAM -library file, is determined at compile time. The default is 5. -If you need support for more elements, you have to change the -define for the constant 'maxelt' at the beginning of the file -src/MEAM/meam.h and update/recompile LAMMPS. There is no -limit on the number of atoms types. +The maximum number of elements, that can be read from the MEAM library +file, is determined at compile time. The default is 5. 
If you need +support for more elements, you have to change the the constant 'maxelt' +at the beginning of the file ``src/MEAM/meam.h`` and update/recompile +LAMMPS. There is no limit on the number of atoms types. Related commands """""""""""""""" @@ -421,6 +449,10 @@ none **(Baskes)** Baskes, Phys Rev B, 46, 2727-2742 (1992). +.. _Baskes2: + +**(Baskes2)** Baskes, Phys Rev B, 75, 094113 (2007). + .. _Gullet: **(Gullet)** Gullet, Wagner, Slepoy, SANDIA Report 2003-8782 (2003). DOI:10.2172/918395 diff --git a/doc/src/pair_style.rst b/doc/src/pair_style.rst index facfadeb9b..b3f7276480 100644 --- a/doc/src/pair_style.rst +++ b/doc/src/pair_style.rst @@ -277,7 +277,8 @@ accelerated styles exist. * :doc:`lubricateU/poly ` - hydrodynamic lubrication forces for Fast Lubrication with polydispersity * :doc:`mdpd ` - mDPD particle interactions * :doc:`mdpd/rhosum ` - mDPD particle interactions for mass density -* :doc:`meam ` - modified embedded atom method (MEAM) in C +* :doc:`meam ` - modified embedded atom method (MEAM) +* :doc:`meam/ms ` - multi-state modified embedded atom method (MS-MEAM) * :doc:`meam/spline ` - splined version of MEAM * :doc:`meam/sw/spline ` - splined version of MEAM with a Stillinger-Weber term * :doc:`mesocnt ` - mesoscopic vdW potential for (carbon) nanotubes diff --git a/examples/meam/msmeam/HGa.meam b/examples/meam/msmeam/HGa.meam new file mode 100644 index 0000000000..9f01501c16 --- /dev/null +++ b/examples/meam/msmeam/HGa.meam @@ -0,0 +1,30 @@ +bkgd_dyn = 1 +emb_lin_neg = 1 +augt1=0 +ialloy=1 +rc = 5.9 +#H +attrac(1,1)=0.460 +repuls(1,1)=0.460 +Cmin(1,1,1)=1.3 # PuMS +Cmax(1,1,1)= 2.80 +nn2(1,1)=1 +#Ga +rho0(2) = 0.6 +attrac(2,2)=0.097 +repuls(2,2)=0.097 +nn2(2,2)=1 +#HGa +attrac(1,2)=0.300 +repuls(1,2)=0.300 +lattce(1,2)=l12 +re(1,2)=3.19 +delta(1,2)=-0.48 +alpha(1,2)=6.6 +Cmin(1,1,2)=2.0 +Cmin(2,1,2)= 2.0 +Cmin(1,2,1)=2.0 +Cmin(2,2,1) = 1.4 +Cmin(1,2,2) = 1.4 +Cmin(1,1,2) = 1.4 +nn2(1,2)=1 diff --git a/examples/meam/msmeam/README.md 
b/examples/meam/msmeam/README.md new file mode 100644 index 0000000000..dbf569d4b3 --- /dev/null +++ b/examples/meam/msmeam/README.md @@ -0,0 +1,9 @@ +To run Baske's test, do + + lmp -in in.msmeam + +Then + + diff dump.msmeam dump.msmeam.bu + + diff --git a/examples/meam/msmeam/data.msmeam.bu b/examples/meam/msmeam/data.msmeam.bu new file mode 100644 index 0000000000..576a3c50de --- /dev/null +++ b/examples/meam/msmeam/data.msmeam.bu @@ -0,0 +1,25 @@ +LAMMPS data file via write_data, version 16 Feb 2016, timestep = 1 + +3 atoms +2 atom types + +-4.0000000000000000e+00 4.0000000000000000e+00 xlo xhi +-4.0000000000000000e+00 4.0000000000000000e+00 ylo yhi +-4.0000000000000000e+00 4.0000000000000000e+00 zlo zhi + +Masses + +1 1.0079 +2 69.723 + +Atoms # atomic + +1 1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0 +2 2 2.2000000000000002e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0 +3 2 2.9999999999999999e-01 2.2999999999999998e+00 0.0000000000000000e+00 0 0 0 + +Velocities + +1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +2 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 +3 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 diff --git a/examples/meam/msmeam/dump.msmeam.bu b/examples/meam/msmeam/dump.msmeam.bu new file mode 100644 index 0000000000..039f630073 --- /dev/null +++ b/examples/meam/msmeam/dump.msmeam.bu @@ -0,0 +1,24 @@ +ITEM: TIMESTEP +0 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS pp pp pp +-4 4 +-4 4 +-4 4 +ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0 +2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0 +3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0 +ITEM: TIMESTEP +1 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS pp pp pp +-4 4 +-4 4 +-4 4 +ITEM: 
ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0 +2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0 +3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0 diff --git a/examples/meam/msmeam/in.msmeam b/examples/meam/msmeam/in.msmeam new file mode 100644 index 0000000000..82ffb89a13 --- /dev/null +++ b/examples/meam/msmeam/in.msmeam @@ -0,0 +1,31 @@ +echo both +log log.msmeam +# Test of MEAM potential for HGa + +# ------------------------ INITIALIZATION ---------------------------- +units metal +dimension 3 +boundary p p p +atom_style atomic +variable latparam equal 4.646 +variable ncell equal 3 + +# ----------------------- ATOM DEFINITION ---------------------------- +region box block -4 4 -4 4 -4 4 +create_box 2 box + +# + +include potential.mod +create_atoms 1 single 0 0 0 units box +create_atoms 2 single 2.2 0 0 units box +create_atoms 2 single 0.3 2.3 0 units box +# ---------- Define Settings --------------------- +variable teng equal "c_eatoms" +compute pot_energy all pe/atom +compute stress all stress/atom NULL +dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +run 1 +write_data data.msmeam + +print "All done!" 
diff --git a/examples/meam/msmeam/library.msmeam b/examples/meam/msmeam/library.msmeam new file mode 100644 index 0000000000..9937eaee08 --- /dev/null +++ b/examples/meam/msmeam/library.msmeam @@ -0,0 +1,14 @@ +# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010 +# ms-meam data format May 2010 +# elt lat z ielement atwt +# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub +# - t0 t1 t2 t3 t1m t2m t3m rozero ibar +# NOTE: leading character cannot be a space + +'H' 'dim' 1.0 1 1.0079 +2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50 +1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0 + +'Ga4' 'fcc' 12.0 31 69.723 +4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97 +1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0 diff --git a/examples/meam/msmeam/log.msmeam.bu b/examples/meam/msmeam/log.msmeam.bu new file mode 100644 index 0000000000..8eac453c1e --- /dev/null +++ b/examples/meam/msmeam/log.msmeam.bu @@ -0,0 +1,107 @@ +# Test of MEAM potential for HGa + +# ------------------------ INITIALIZATION ---------------------------- +units metal +dimension 3 +boundary p p p +atom_style atomic +variable latparam equal 4.646 +variable ncell equal 3 + +# ----------------------- ATOM DEFINITION ---------------------------- +region box block -4 4 -4 4 -4 4 +create_box 2 box +Created orthogonal box = (-4 -4 -4) to (4 4 4) + 1 by 1 by 1 MPI processor grid + +# + +include potential.mod +# NOTE: This script can be modified for different pair styles +# See in.elastic for more info. 
+ +variable Pu string H +print "potential chosen ${Pu}" +potential chosen H +# Choose potential +pair_style MSmeam +print "we just executed" +we just executed + +pair_coeff * * library.MSmeam ${Pu} Ga4 HGaMS.meam ${Pu} Ga4 +pair_coeff * * library.MSmeam H Ga4 HGaMS.meam ${Pu} Ga4 +pair_coeff * * library.MSmeam H Ga4 HGaMS.meam H Ga4 +Reading potential file library.MSmeam with DATE: 2018-09-22 +# Setup neighbor style +neighbor 1.0 nsq +neigh_modify once no every 1 delay 0 check yes + +# Setup minimization style +variable dmax equal 1.0e-2 +min_style cg +min_modify dmax ${dmax} line quadratic +min_modify dmax 0.01 line quadratic +compute eng all pe/atom +compute eatoms all reduce sum c_eng + +# Setup output +thermo 100 +thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms +thermo_modify norm yes +create_atoms 1 single 0 0 0 units box +Created 1 atoms +create_atoms 2 single 2.2 0 0 units box +Created 1 atoms +create_atoms 2 single 0.3 2.3 0 units box +Created 1 atoms +# ---------- Define Settings --------------------- +variable teng equal "c_eatoms" +compute pot_energy all pe/atom +compute stress all stress/atom NULL +dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +run 1 +WARNING: No fixes defined, atoms won't move (../verlet.cpp:55) +Neighbor list info ... 
+ 2 neighbor list requests + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 6.9 + ghost atom cutoff = 6.9 +Memory usage per processor = 12.9295 Mbytes +Step Temp TotEng Press Pxx Pyy Pzz Pxy Pxz Pyz Lx Ly Lz Volume eatoms + 0 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079 + 1 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079 +Loop time of 0.000172138 on 1 procs for 1 steps with 3 atoms + +Performance: 501.922 ns/day, 0.048 hours/ns, 5809.285 timesteps/s +81.3% CPU use with 1 MPI tasks x no OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 6.6996e-05 | 6.6996e-05 | 6.6996e-05 | 0.0 | 38.92 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 1.9073e-06 | 1.9073e-06 | 1.9073e-06 | 0.0 | 1.11 +Output | 9.7036e-05 | 9.7036e-05 | 9.7036e-05 | 0.0 | 56.37 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 6.199e-06 | | | 3.60 + +Nlocal: 3 ave 3 max 3 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 78 ave 78 max 78 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 7 ave 7 max 7 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 14 ave 14 max 14 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 14 +Ave neighs/atom = 4.66667 +Neighbor list builds = 0 +Dangerous builds = 0 +write_data data.msmeam + +print "All done!" +All done! 
+Total wall time: 0:00:00 + diff --git a/examples/meam/msmeam/msmeam.dump.bu b/examples/meam/msmeam/msmeam.dump.bu new file mode 100644 index 0000000000..039f630073 --- /dev/null +++ b/examples/meam/msmeam/msmeam.dump.bu @@ -0,0 +1,24 @@ +ITEM: TIMESTEP +0 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS pp pp pp +-4 4 +-4 4 +-4 4 +ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0 +2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0 +3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0 +ITEM: TIMESTEP +1 +ITEM: NUMBER OF ATOMS +3 +ITEM: BOX BOUNDS pp pp pp +-4 4 +-4 4 +-4 4 +ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6] +1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0 +2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0 +3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0 diff --git a/examples/meam/msmeam/potential.mod b/examples/meam/msmeam/potential.mod new file mode 100644 index 0000000000..760cc93503 --- /dev/null +++ b/examples/meam/msmeam/potential.mod @@ -0,0 +1,25 @@ +# NOTE: This script can be modified for different pair styles +# See in.elastic for more info. 
+ +variable Pu string H +print "potential chosen ${Pu}" +# Choose potential +pair_style meam/ms +print "we just executed" + +pair_coeff * * library.msmeam ${Pu} Ga4 HGa.meam ${Pu} Ga4 +# Setup neighbor style +neighbor 1.0 bin +neigh_modify once no every 1 delay 0 check yes + +# Setup minimization style +variable dmax equal 1.0e-2 +min_style cg +min_modify dmax ${dmax} line quadratic +compute eng all pe/atom +compute eatoms all reduce sum c_eng + +# Setup output +thermo 100 +thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms +thermo_modify norm yes diff --git a/lib/gpu/Makefile.lammps.standard b/lib/gpu/Makefile.lammps.standard index 9526e8e373..0bb3394b3e 100644 --- a/lib/gpu/Makefile.lammps.standard +++ b/lib/gpu/Makefile.lammps.standard @@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda endif gpu_SYSINC = -gpu_SYSLIB = -lcudart -lcuda +gpu_SYSLIB = -lcudart -lcuda -lcufft gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 56942d3f3c..298d404117 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -1,9 +1,17 @@ +# Common headers for kernels +PRE1_H = lal_preprocessor.h lal_aux_fun1.h + # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \ lal_pre_cuda_hip.h -ALL_H = $(NVD_H) $(wildcard ./lal_*.h) +# Headers for Host files +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ + lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ + lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ + lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H) + # Source files SRCS := $(wildcard ./lal_*.cpp) OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o)) @@ -54,13 +62,40 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > 
$(OBJ_DIR)/pppm_d_cubin.h -$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) +$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu $(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@ # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H) + $(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT @@ -77,10 +112,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp $(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu - $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu + $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu - $(CUDA) -o $@ -c cudpp_mini/scan_app.cu + $(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini #endif # build libgpu.a diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 
2ff98827d4..d318da15dd 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h) OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h # Headers for Host files -HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \ +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H) @@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra. $(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h $(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h; +$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h + $(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h; + $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; diff --git a/lib/gpu/geryon/hip_macros.h b/lib/gpu/geryon/hip_macros.h index 96313ec87e..e16caf4944 100644 --- a/lib/gpu/geryon/hip_macros.h +++ b/lib/gpu/geryon/hip_macros.h @@ -26,6 +26,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h index ac2e6cc682..19c8ff4b6c 100644 --- a/lib/gpu/geryon/nvd_macros.h +++ b/lib/gpu/geryon/nvd_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 4163d40881..588c53c8fa 100644 --- a/lib/gpu/geryon/ocl_device.h +++ 
b/lib/gpu/geryon/ocl_device.h @@ -309,15 +309,14 @@ class UCL_Device { /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes - inline size_t max_pitch(const int i) { return 0; } + inline size_t max_pitch(const int) { return 0; } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported() { return sharing_supported(_device); } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ - inline bool sharing_supported(const int i) - { return true; } + inline bool sharing_supported(const int) { return true; } /// True if the device is a sub-device inline bool is_subdevice() diff --git a/lib/gpu/geryon/ocl_macros.h b/lib/gpu/geryon/ocl_macros.h index 5e5a190ede..652d7795e9 100644 --- a/lib/gpu/geryon/ocl_macros.h +++ b/lib/gpu/geryon/ocl_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index bfc260889a..5d8b9808bd 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o, template inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, - const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){ cl_mem_flags buffer_perm; cl_map_flags map_perm; if (kind==UCL_READ_ONLY) { @@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t 
/*dst_offset*/, const size_t src_offset) { if (src.cbegin()==dst.cbegin()) { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 1S\n"; @@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t dst_offset, const size_t /*src_offset*/) { if (src.cbegin()==dst.cbegin()) { if (block) ucl_sync(cq); #ifdef UCL_DBG_MEM_TRACE diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 8ddde5b2a3..87db3794a6 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -35,19 +35,19 @@ class UCL_Texture { UCL_Texture() {} ~UCL_Texture() {} /// Construct with a specified texture reference - inline UCL_Texture(UCL_Program &prog, const char *texture_name) { } + inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Set the texture reference for this object - inline void get_texture(UCL_Program &prog, const char *texture_name) { } + inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Bind a float array where each fetch grabs a vector of length numel template - inline void bind_float(mat_typ &vec, const unsigned numel) { } + inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { } /// Unbind the texture reference from the memory allocation inline void unbind() { } /// Make a texture reference available to kernel - inline void allow(UCL_Kernel &kernel) { } + inline void allow(UCL_Kernel & /*kernel*/) { } private: friend class UCL_Kernel; @@ -62,7 +62,7 @@ class UCL_Const { inline UCL_Const(UCL_Program &prog, const char *global_name) { get_global(prog,global_name); } /// Set the global reference for this object - inline void get_global(UCL_Program &prog, const char *global_name) { + inline void get_global(UCL_Program &prog, const char * /*global_name*/) { if (_active) { 
CL_DESTRUCT_CALL(clReleaseContext(_context)); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 189871e631..8f55a91a28 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -71,7 +71,7 @@ class UCL_Timer { inline void init(UCL_Device &dev) { init(dev,dev.cq()); } /// Initialize command queue for timing - inline void init(UCL_Device &dev, command_queue &cq) { + inline void init(UCL_Device & /*dev*/, command_queue &cq) { clear(); _cq=cq; clRetainCommandQueue(_cq); diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c906a14f30..94b57f7a09 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> { // Should never be here template struct _host_host_copy { template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) { assert(0==1); } template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) { assert(0==1); } }; @@ -470,24 +469,22 @@ template struct _ucl_cast_copy { // Neither on host or both on host template <> struct _ucl_cast_copy<1,1> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const 
size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; @@ -495,24 +492,22 @@ template <> struct _ucl_cast_copy<1,1> { // Neither on host or both on host template <> struct _ucl_cast_copy<0,0> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 9158e145b3..5e281fef07 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not 
prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -230,8 +230,8 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols) { + inline void view_offset(const size_t offset,ucl_type &input, + const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 2f49f9f633..9f734ac40c 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template - inline void view(ptr_type *input, const size_t rows, const size_t cols, + inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols, UCL_Device &dev) { #ifdef UCL_DEBUG assert(rows==1); @@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + inline void view_offset(const size_t 
offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h index a10f3cdb3f..9bc2c40fe2 100644 --- a/lib/gpu/geryon/ucl_s_obj_help.h +++ b/lib/gpu/geryon/ucl_s_obj_help.h @@ -27,7 +27,7 @@ template struct _ucl_s_obj_help; // -- Can potentially use same memory if shared by accelerator template <> struct _ucl_s_obj_help<1> { template - static inline int alloc(t1 &host, t2 &device, t3 &_buffer, + static inline int alloc(t1 &host, t2 &device, t3 & /*_buffer*/, const int cols, UCL_Device &acc, const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { @@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> { } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,async); } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cq); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - const bool async) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cols,cq); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, - t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,rows,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, - t3 &buffer, command_queue &cq) { + static 
inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,rows,cols,cq); } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) { if (device.kind()==UCL_VIEW) { device.view(host); return UCL_SUCCESS; @@ -353,7 +349,7 @@ template struct _ucl_s_obj_help { } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) { int err=buff.resize(cols); if (err!=UCL_SUCCESS) return err; diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp new file mode 100644 index 0000000000..5e19997913 --- /dev/null +++ b/lib/gpu/lal_amoeba.cpp @@ -0,0 +1,322 @@ +/*************************************************************************** + amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the amoeba pair style. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "amoeba_cl.h" +#elif defined(USE_CUDART) +const char *amoeba=0; +#else +#include "amoeba_cubin.h" +#endif + +#include "lal_amoeba.h" +#include +namespace LAMMPS_AL { +#define AmoebaT Amoeba + +extern Device device; + +template +AmoebaT::Amoeba() : BaseAmoeba(), + _allocated(false) { +} + +template +AmoebaT::~Amoeba() { + clear(); +} + +template +int AmoebaT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double * /*host_special_repel*/, + const double * /*host_special_disp*/, + const double *host_special_mpole, + const double * /*host_special_polar_wscale*/, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,amoeba, + "k_amoeba_multipole", "k_amoeba_udirect2b", + "k_amoeba_umutual2b", "k_amoeba_polar", + "k_amoeba_fphi_uind", "k_amoeba_fphi_mpole", + "k_amoeba_short_nbor", "k_amoeba_special15"); + if (success!=0) + return success; + + // If atom type constants 
fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = host_dirdamp[i]; + host_write[i].w = host_amtype2class[i]; + } + + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = (numtyp)0; + host_write2[i].w = (numtyp)0; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_hal[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=host_special_mpole[i]; + } + ucl_copy(sp_amoeba,dview,5,false); + + _polar_dscale = polar_dscale; + _polar_uscale = polar_uscale; + + _allocated=true; + this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + + sp_amoeba.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); + return 0; +} + +template +void AmoebaT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff_amtype.clear(); + coeff_amclass.clear(); + 
sp_amoeba.clear(); + + this->clear_atomic(); +} + +template +double AmoebaT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Amoeba); +} + +// --------------------------------------------------------------------------- +// Calculate the multipole real-space term, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step for AMOEBA + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Launch the real-space permanent field kernel +// --------------------------------------------------------------------------- +template +int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + 
+ int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_udirect2b.set_size(GX,BX); + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Launch the real-space induced field kernel, returning field and fieldp +// --------------------------------------------------------------------------- +template +int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, 
&this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Launch the polar real-space kernel, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + */ + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, 
&this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_polar_avail = false; + + return GX; +} + +template class Amoeba; +} diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu new file mode 100644 index 0000000000..f572d3ebd0 --- /dev/null +++ b/lib/gpu/lal_amoeba.cu @@ -0,0 +1,2099 @@ +// ************************************************************************** +// amoeba.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the amoeba pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_aux_fun1.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define 
store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, 
red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + 
red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + int m; + for (m = 1; m < 6; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 6; m++) bn[m] *= felec; + + numtyp term1,term2,term3; + numtyp term4,term5,term6; + + term1 = ci*ck; + term2 = ck*dir - ci*dkr + dik; + term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; + term5 = qir*qkr; + numtyp scalek = (numtyp)1.0 - factor_mpole; + rr1 = bn[0] - scalek*rr1; + rr3 = bn[1] - scalek*rr3; + rr5 = bn[2] - scalek*rr5; + rr7 = bn[3] - scalek*rr7; + rr9 = bn[4] - scalek*rr9; + rr11 = bn[5] - scalek*rr11; + numtyp e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; + + // find standard multipole intermediates for force and torque + + numtyp de = term1*rr3 + term2*rr5 + term3*rr7 + 
term4*rr9 + term5*rr11; + term1 = -ck*rr3 + dkr*rr5 - qkr*rr7; + term2 = ci*rr3 + dir*rr5 + qir*rr7; + term3 = (numtyp)2.0 * rr5; + term4 = (numtyp)2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); + term5 = (numtyp)2.0 * (-ci*rr5-dir*rr7-qir*rr9); + term6 = (numtyp)4.0 * rr7; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply 
Thole polarization damping to scale factors + + numtyp sc3 = (numtyp)1.0; + numtyp sc5 = (numtyp)1.0; + numtyp sc7 = (numtyp)1.0; + for (k = 0; k < 3; k++) { + rc3[k] = (numtyp)0.0; + rc5[k] = (numtyp)0.0; + rc7[k] = (numtyp)0.0; + } + + // apply Thole polarization damping to scale factors + + numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] + numtyp tmp = r*ucl_recip(damp); + damp = pgamma * (tmp*tmp*tmp); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + sc3 = (numtyp)1.0 - expdamp; + sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp; + sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp; + numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv; + numtyp temp5 = damp; + numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp; + rc3[0] = xr * temp3; + rc3[1] = yr * temp3; + rc3[2] = zr * temp3; + rc5[0] = rc3[0] * temp5; + rc5[1] = rc3[1] * temp5; + rc5[2] = rc3[2] * temp5; + rc7[0] = rc5[0] * temp7; + rc7[1] = rc5[1] * temp7; + rc7[2] = rc5[2] * temp7; + } + + psc3 = (numtyp)1.0 - sc3*factor_pscale; + psc5 = (numtyp)1.0 - sc5*factor_pscale; + psc7 = (numtyp)1.0 - sc7*factor_pscale; + dsc3 = (numtyp)1.0 - sc3*factor_dscale; + dsc5 = (numtyp)1.0 - sc5*factor_dscale; + dsc7 = (numtyp)1.0 - sc7*factor_dscale; + usc3 = (numtyp)1.0 - sc3*factor_uscale; + usc5 = (numtyp)1.0 - sc5*factor_uscale; + psr3 = bn[1] - psc3*rr3; + psr5 = bn[2] - psc5*rr5; + psr7 = bn[3] - psc7*rr7; + dsr3 = bn[1] - dsc3*rr3; + dsr5 = bn[2] - dsc5*rr5; + dsr7 = bn[3] - dsc7*rr7; + usr5 = bn[2] - usc5*rr5; + for (k = 0; k < 3; k++) { + prc3[k] = rc3[k] * factor_pscale; + prc5[k] = rc5[k] * factor_pscale; + prc7[k] = rc7[k] * factor_pscale; + drc3[k] = rc3[k] * factor_dscale; + drc5[k] = rc5[k] * factor_dscale; + drc7[k] = rc7[k] * factor_dscale; + urc3[k] = rc3[k] * factor_uscale; + urc5[k] = rc5[k] * factor_uscale; + } + } else { // damp == 0: ??? 
+ } + + // get the induced dipole field used for dipole torques + + numtyp tix3 = psr3*ukx + dsr3*ukxp; + numtyp tiy3 = psr3*uky + dsr3*ukyp; + numtyp tiz3 = psr3*ukz + dsr3*ukzp; + numtyp tuir = -psr5*ukr - dsr5*ukrp; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp); + numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); + numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); + tuir = -psr7*ukr - dsr7*ukrp; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the dEd/dR terms used for direct polarization force + + term1 = bn[2] - dsc3*rr5; + term2 = bn[3] - dsc5*rr7; + term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr - dsr5*xr; + term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; + term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; + numtyp tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr - dsr5*yr; + term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1]; + term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr; + numtyp tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 + + 
(numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2]; + term4 = rr3*drc3[2] - term1*zr - dsr5*zr; + term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2]; + term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2]; + term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr; + numtyp tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*xr*yr - rr3*yr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0]; + term7 = rr5*drc5[0] - term2*xr; + numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6; + + term3 = term1*xr*zr - rr3*zr*drc3[0]; + term5 = term2*xr*zr - rr5*zr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0]; + numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*yr*zr - rr3*zr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1]; + term7 = rr5*drc5[1] - term2*yr; + numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - 
dkr*term5 + + (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp; + numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp; + numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp; + + numtyp frcx = depx; + numtyp frcy = depy; + numtyp frcz = depz; + + // get the dEp/dR terms used for direct polarization force + + // tixx and tkxx + term1 = bn[2] - psc3*rr5; + term2 = bn[3] - psc5*rr7; + term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr - psr5*xr; + term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0]; + term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr; + tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + // tiyy and tkyy + term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr - psr5*yr; + term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1]; + term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr; + tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + // tizz and tkzz + term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2]; + term4 = rr3*prc3[2] - term1*zr - psr5*zr; + term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2]; + term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2]; + term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr; + tizz = ci*term3 + 
diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tixy and tkxy + term3 = term1*xr*yr - rr3*yr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0]; + term7 = rr5*prc5[0] - term2*xr; + tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6; + + // tixz and tkxz + term3 = term1*xr*zr - rr3*zr*prc3[0]; + term5 = term2*xr*zr - rr5*zr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0]; + tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tiyz and tkyz + term3 = term1*yr*zr - rr3*zr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1]; + term7 = rr5*prc5[1] - term2*yr; + tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + 
+ // get the dtau/dr terms used for mutual polarization force + // poltyp == MUTUAL && amoeba + + term1 = bn[2] - usc3*rr5; + term2 = bn[3] - usc5*rr7; + term3 = usr5 + term1; + term4 = rr3 * factor_uscale; + term5 = -xr*term3 + rc3[0]*term4; + term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0]; + tixx = uix*term5 + uir*term6; + tkxx = ukx*term5 + ukr*term6; + + term5 = -yr*term3 + rc3[1]*term4; + term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1]; + tiyy = uiy*term5 + uir*term6; + tkyy = uky*term5 + ukr*term6; + + term5 = -zr*term3 + rc3[2]*term4; + term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2]; + tizz = uiz*term5 + uir*term6; + tkzz = ukz*term5 + ukr*term6; + + term4 = -usr5 * yr; + term5 = -xr*term1 + rr3*urc3[0]; + term6 = xr*yr*term2 - rr5*yr*urc5[0]; + tixy = uix*term4 + uiy*term5 + uir*term6; + tkxy = ukx*term4 + uky*term5 + ukr*term6; + + term4 = -usr5 * zr; + term6 = xr*zr*term2 - rr5*zr*urc5[0]; + tixz = uix*term4 + uiz*term5 + uir*term6; + tkxz = ukx*term4 + ukz*term5 + ukr*term6; + + term5 = -yr*term1 + rr3*urc3[1]; + term6 = yr*zr*term2 - rr5*zr*urc5[1]; + tiyz = uiy*term4 + uiz*term5 + uir*term6; + tkyz = uky*term4 + ukz*term5 + ukr*term6; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp + + tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + f.x += frcx; + f.y += frcy; + f.z += frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) 
{ + int offset=ii; + for (int k=0; k +class Amoeba : public BaseAmoeba { + public: + Amoeba(); + ~Amoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; + /// Special amoeba values [0-4]: + /// sp_amoeba.x = special_hal + /// sp_amoeba.y = 
special_polar_pscale, + /// sp_amoeba.z = special_polar_piscale + /// sp_amoeba.w = special_mpole + UCL_D_Vec sp_amoeba; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + protected: + bool _allocated; + int multipole_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + +}; + +} + +#endif diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp new file mode 100644 index 0000000000..995dfbe95f --- /dev/null +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -0,0 +1,213 @@ +/*************************************************************************** + amoeba_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to amoeba acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_amoeba.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Amoeba AMOEBAMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double 
*host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale) { + AMOEBAMF.clear(); + gpu_mode=AMOEBAMF.device->gpu_mode(); + double gpu_split=AMOEBAMF.device->particle_split(); + int first_gpu=AMOEBAMF.device->first_device(); + int last_gpu=AMOEBAMF.device->last_device(); + int world_me=AMOEBAMF.device->world_me(); + int gpu_rank=AMOEBAMF.device->gpu_rank(); + int procs_per_gpu=AMOEBAMF.device->procs_per_gpu(); + + AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu); + + bool message=false; + if (AMOEBAMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_hal, + host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); + + AMOEBAMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + AMOEBAMF.estimate_gpu_overhead(); + return init_ok; +} + +void amoeba_gpu_clear() { + AMOEBAMF.clear(); +} + +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int 
*host_amtype, + int *host_amgroup, double **host_rpole, double ** /*host_uind*/, + double ** /*host_uinp*/, double * /*host_pval*/, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); +} + + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, 
off2, fieldp_ptr); +} + +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, off2, fieldp_ptr); +} + +void amoeba_gpu_update_fieldp(void **fieldp_ptr) { + AMOEBAMF.update_fieldp(fieldp_ptr); +} + +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); +} + +void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid, + nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi); +} + +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) { + AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec); +} + +void amoeba_setup_fft(const int numel, const int element_type) { + AMOEBAMF.setup_fft(numel, element_type); +} + +void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) { + 
AMOEBAMF.compute_fft1d(in, out, numel, mode); +} + +double amoeba_gpu_bytes() { + return AMOEBAMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 17cfa0dc5a..72cb59a912 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const { bytes+=sizeof(numtyp); if (_vel) bytes+=4*sizeof(numtyp); + if (_extra_fields>0) + bytes+=_extra_fields*sizeof(numtyp4); return bytes; } @@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } + if (_extra_fields>0) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } if (_gpu_nbor>0) { if (_bonds) { @@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) { template bool AtomT::add_fields(const bool charge, const bool rot, - const int gpu_nbor, const bool bonds, const bool vel) { + const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { bool success=true; // Ignore host/device transfers? 
int gpu_bytes=0; @@ -191,7 +199,17 @@ bool AtomT::add_fields(const bool charge, const bool rot, } } - if (bonds && !_bonds) { + if (extra_fields > 0 && _extra_fields==0) { + _extra_fields=extra_fields; + _other=true; + if (_host_view==false) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } + } + + if (bonds && _bonds==false) { _bonds=true; if (_bonds && _gpu_nbor>0) { success=success && (dev_tag.alloc(_max_atoms,*dev, @@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot, template bool AtomT::init(const int nall, const bool charge, const bool rot, - UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) { + UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { clear(); bool success=true; @@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, _q_avail=false; _quat_avail=false; _v_avail=false; + _extra_avail=false; _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; _rot=rot; _vel=vel; - _other=_charge || _rot || _vel; + _extra_fields=extra_fields; + _other=_charge || _rot || _vel || (extra_fields>0); dev=&devi; _time_transfer=0; @@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_q.init(*dev); time_quat.init(*dev); time_vel.init(*dev); + time_extra.init(*dev); + time_pos.zero(); time_q.zero(); time_quat.zero(); time_vel.zero(); + time_extra.zero(); + _time_cast=0.0; #ifdef GPU_CAST @@ -308,6 +333,8 @@ void AtomT::clear_resize() { quat.clear(); if (_vel) v.clear(); + if (_extra_fields>0) + extra.clear(); dev_cell_id.clear(); dev_particle_id.clear(); @@ -350,6 +377,7 @@ void AtomT::clear() { time_q.clear(); time_quat.clear(); time_vel.clear(); + time_extra.clear(); clear_resize(); #ifdef GPU_CAST @@ -370,12 +398,19 @@ double AtomT::host_memory_usage() const { atom_bytes+=4; if (_vel) 
atom_bytes+=4; + if (_extra_fields>0) + atom_bytes+=_extra_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } +#ifdef USE_CUDPP +#define USE_CUDPP_ARG(arg) arg +#else +#define USE_CUDPP_ARG(arg) +#endif // Sort arrays for neighbor list calculation template -void AtomT::sort_neighbor(const int num_atoms) { +void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) { #ifdef USE_CUDPP CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), (int *)dev_particle_id.begin(), diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 77c1faa784..771c2a3571 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -76,7 +76,7 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ @@ -96,7 +96,7 @@ class Atom { * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, - const bool bonds, const bool vel=false); + const bool bonds, const bool vel=false, const int extra_fields=0); /// Returns true if GPU is using charges bool charge() { return _charge; } @@ -107,6 +107,9 @@ class Atom { /// Returns true if GPU is using velocities bool velocity() { return _vel; } + /// Returns true if GPU is using extra fields + bool using_extra() { return (_extra_fields>0); } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -128,6 +131,8 @@ class Atom { time_quat.add_to_total(); if (_vel) time_vel.add_to_total(); + if (_extra_fields>0) + time_extra.add_to_total(); } /// Add copy times to timers @@ -139,6 +144,8 @@ class Atom { time_quat.zero(); 
if (_vel) time_vel.zero(); + if (_extra_fields>0) + time_extra.zero(); } /// Return the total time for host/device data transfer @@ -158,6 +165,10 @@ class Atom { total+=time_vel.total_seconds(); time_vel.zero_total(); } + if (_extra_fields>0) { + total+=time_extra.total_seconds(); + time_extra.zero_total(); + } return total+_time_transfer/1000.0; } @@ -281,7 +292,11 @@ class Atom { /// Signal that we need to transfer atom data for next timestep inline void data_unavail() - { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } + { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; } + + /// Signal that we need to transfer atom extra data for next kernel call + inline void extra_data_unavail() + { _extra_avail=false; } typedef struct { double x,y,z; } vec3d; typedef struct { numtyp x,y,z,w; } vec4d_t; @@ -312,7 +327,7 @@ class Atom { /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - inline void add_x_data(double **host_ptr, int *host_type) { + inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) { time_pos.start(); if (_x_avail==false) { #ifdef GPU_CAST @@ -426,7 +441,7 @@ class Atom { /// Copy velocities and tags to device asynchronously /** Copies nall() elements **/ - inline void add_v_data(double **host_ptr, tagint *host_tag) { + inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) { time_vel.start(); if (_v_avail==false) { #ifdef GPU_CAST @@ -450,6 +465,33 @@ class Atom { add_v_data(host_ptr,host_tag); } + // Cast extras to write buffer + template + inline void cast_extra_data(cpytyp *host_ptr) { + if (_extra_avail==false) { + double t=MPI_Wtime(); + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i=0; i<_nall*_extra_fields; i++) + extra[i]=host_ptr[i]; + _time_cast+=MPI_Wtime()-t; + } + } 
+ + // Copy extras to device + /** Copies nall()*_extra elements **/ + inline void add_extra_data() { + time_extra.start(); + if (_extra_avail==false) { + extra.update_device(_nall*_extra_fields,true); + _extra_avail=true; + } + time_extra.stop(); + } + /// Add in casting time from additional data (seconds) inline void add_cast_time(double t) { _time_cast+=t; } @@ -473,6 +515,8 @@ class Atom { UCL_Vector quat; /// Velocities UCL_Vector v; + /// Extras + UCL_Vector extra; #ifdef GPU_CAST UCL_Vector x_cast; @@ -493,7 +537,7 @@ class Atom { UCL_H_Vec host_particle_id; /// Device timers - UCL_Timer time_pos, time_q, time_quat, time_vel; + UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra; /// Geryon device UCL_Device *dev; @@ -508,11 +552,12 @@ class Atom { bool _compiled; // True if data has been copied to device already - bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; + bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized; bool alloc(const int nall); bool _allocated, _rot, _charge, _bonds, _vel, _other; + int _extra_fields; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp new file mode 100644 index 0000000000..09d7386461 --- /dev/null +++ b/lib/gpu/lal_base_amoeba.cpp @@ -0,0 +1,962 @@ +/*************************************************************************** + base_amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include "lal_base_amoeba.h" + +namespace LAMMPS_AL { +#define BaseAmoebaT BaseAmoeba + +extern Device global_device; + +template +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) { + device=&global_device; + ans=new Answer(); + nbor=new Neighbor(); + pair_program=nullptr; + ucl_device=nullptr; +} + +template +BaseAmoebaT::~BaseAmoeba() { + delete ans; + delete nbor; + k_multipole.clear(); + k_udirect2b.clear(); + k_umutual2b.clear(); + k_fphi_uind.clear(); + k_fphi_mpole.clear(); + k_polar.clear(); + k_special15.clear(); + k_short_nbor.clear(); + + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created) cufftDestroy(plan); + #endif + + if (pair_program) delete pair_program; +} + +template +int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseAmoebaT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const int maxspecial15, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name_multipole, + const char *k_name_udirect2b, + const char *k_name_umutual2b, + const char *k_name_polar, + const char *k_name_fphi_uind, + const char *k_name_fphi_mpole, + const char *k_name_short_nbor, + const char* k_name_special15) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int 
host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_charge(); + + bool charge = true; + bool rot = false; + bool vel = false; + _extra_fields = 24; // round up to accomodate quadruples of numtyp values + // rpole 13; uind 3; uinp 3; amtype, amgroup; pval + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); + if (success!=0) + return success; + + if (ucl_device!=device->gpu) _compiled=false; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + _block_bio_size=device->block_bio_pair(); + compile_kernels(*ucl_device,pair_program,k_name_multipole, + k_name_udirect2b, k_name_umutual2b,k_name_polar, + k_name_fphi_uind, k_name_fphi_mpole, + k_name_short_nbor, k_name_special15); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else { + _nbor_data=&(nbor->dev_nbor); + } + + bool alloc_packed=false; + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, + _gpu_host,max_nbors,cell_size,alloc_packed, + _threads_per_atom); + if (success!=0) + return success; + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + _maxspecial=maxspecial; + _maxspecial15=maxspecial15; + + // allocate per-atom array tep + + int ef_nall=nlocal; //nall; + if (ef_nall==0) + ef_nall=2000; + + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _max_fieldp_size = _max_tep_size; + 
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _max_thetai_size = 0; + + _nmax = nall; + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + fft_plan_created = false; + #endif + + #ifdef ASYNC_DEVICE_COPY + _end_command_queue=ucl_device->num_queues(); + ucl_device->push_command_queue(); + #endif + + return success; +} + +template +void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); +} + +template +void BaseAmoebaT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + time_pair.clear(); + hd_balancer.clear(); + + dev_short_nbor.clear(); + nbor->clear(); + ans->clear(); + + _tep.clear(); + _fieldp.clear(); + _thetai1.clear(); + _thetai2.clear(); + _thetai3.clear(); + _igrid.clear(); + _fdip_phi1.clear(); + _fdip_phi2.clear(); + _fdip_sum_phi.clear(); + _cgrid_brick.clear(); + + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + 
return nullptr; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return 0; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + + // add one-five neighbors + + if (_maxspecial15>0) { + UCL_H_Vec view_nspecial15; + UCL_H_Vec view_special15; + view_nspecial15.view(nspecial15,nall,*ucl_device); + view_special15.view(special15[0],nall*_maxspecial15,*ucl_device); + ucl_copy(dev_nspecial15,view_nspecial15,nall,false); + ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false); + nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall); + + add_onefive_neighbors(); + } + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + return mn; +} + +// --------------------------------------------------------------------------- +// Prepare for multiple kernel calls in a time step: +// - reallocate per-atom arrays, if needed +// - transfer extra data from host to device +// - build the full neighbor lists for use by different kernels +// --------------------------------------------------------------------------- + +template +int** BaseAmoebaT::precompute(const 
int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&jnum, const double cpu_time, + bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) { + acc_timers(); + if (eatom) _eflag=2; + else if (eflag_in) _eflag=1; + else _eflag=0; + if (vatom) _vflag=2; + else if (vflag_in) _vflag=1; + else _vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (_eflag) _eflag=2; + if (_vflag) _vflag=2; + #endif + + set_kernel(_eflag,_vflag); + + // ------------------- Resize 1-5 neighbor arrays ------------------------ + + if (nall>_nmax) { + _nmax = nall; + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + hd_balancer.start_timer(); + 
atom->add_x_data(host_x,host_type); + } + atom->add_q_data(); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); + + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + // re-allocate dev_short_nbor if necessary + if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(inum_full)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Compute multipole real-space part +// precompute() should be already invoked before mem (re)allocation +// this is the first part in a time step done on the GPU for AMOEBA for now +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double */*host_pval*/, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool /*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double aewald, const double felec, + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { + // ------------------- Resize _tep array ------------------------ + + if (inum_full>_max_tep_size) { + _max_tep_size=static_cast(static_cast(inum_full)*1.10); + _tep.resize(_max_tep_size*4); + } + *tep_ptr=_tep.host.begin(); + + _off2_mpole = off2_mpole; + _felec = felec; + _aewald = aewald; + multipole_real(_eflag,_vflag); + + // leave the 
answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the permanent field +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr) { + // all the necessary data arrays are already copied from host to device + + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); + + *fieldp_ptr=_fieldp.host.begin(); + + // specify the correct cutoff and alpha values + _off2_polar = off2_polar; + _aewald = aewald; + udirect2b(_eflag,_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + _fieldp.update_host(_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the induced field +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, + void** /*fieldp_ptr*/) { + // only copy the necessary data arrays that are updated over the iterations + // use nullptr for the other arrays that are already copied from host to device + 
cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); + atom->add_extra_data(); + + // set the correct cutoff and alpha + _off2_polar = off2_polar; + _aewald = aewald; + // launch the kernel + umutual2b(_eflag,_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + // NOTE: move this step to update_fieldp() to delay device-host transfer + // after umutual1 and self are done on the GPU + // *fieldp_ptr=_fieldp.host.begin(); + // _fieldp.update_host(_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Prepare for umutual1() after bspline_fill() is done on host +// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed +// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 +// host_igrid is allocated with nmax by 4 +// - transfer extra data from host to device +// NOTE: can be re-used for fphi_mpole() but with a different bsorder value +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** host_igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + // update bsorder with that of the kspace solver + _bsorder = bsorder; + + // allocate or resize per-atom arrays + // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax + // will be consolidated once all terms are ready + + if (_max_thetai_size == 0) { + _max_thetai_size = static_cast(static_cast(inum_full)*1.10); + _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + 
_thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); + } else { + if ((int)_thetai1.cols()<_max_thetai_size*bsorder) { + _max_thetai_size=static_cast(static_cast(inum_full)*1.10); + _thetai1.resize(_max_thetai_size*bsorder); + _thetai2.resize(_max_thetai_size*bsorder); + _thetai3.resize(_max_thetai_size*bsorder); + _igrid.resize(_max_thetai_size*4); + + _fdip_phi1.resize(_max_thetai_size*10); + _fdip_phi2.resize(_max_thetai_size*10); + _fdip_sum_phi.resize(_max_thetai_size*20); + } + } + + #ifdef ASYNC_DEVICE_COPY + _thetai1.cq(ucl_device->cq(_end_command_queue)); + _thetai2.cq(ucl_device->cq(_end_command_queue)); + _thetai3.cq(ucl_device->cq(_end_command_queue)); + #endif + + // pack host data to device + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai1[i][j][0]; + v.y = host_thetai1[i][j][1]; + v.z = host_thetai1[i][j][2]; + v.w = host_thetai1[i][j][3]; + _thetai1[idx] = v; + } + _thetai1.update_device(true); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai2[i][j][0]; + v.y = host_thetai2[i][j][1]; + v.z = host_thetai2[i][j][2]; + v.w = host_thetai2[i][j][3]; + _thetai2[idx] = v; + } + _thetai2.update_device(true); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai3[i][j][0]; + v.y = host_thetai3[i][j][1]; + v.z = host_thetai3[i][j][2]; + v.w = host_thetai3[i][j][3]; + _thetai3[idx] = v; + } + _thetai3.update_device(true); + + for (int i = 0; i < 
inum_full; i++) { + int idx = i*4; + _igrid[idx+0] = host_igrid[i][0]; + _igrid[idx+1] = host_igrid[i][1]; + _igrid[idx+2] = host_igrid[i][2]; + } + _igrid.update_device(true); + + // _cgrid_brick holds the grid-based potential + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx * _ngridy * _ngridz; + + int numel = _num_grid_points; + if (_cgrid_brick.cols() == 0) { + int nsize=(int)(((double)numel)*1.1); + _cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); + } else if (numel > (int)_cgrid_brick.cols()) { + _cgrid_brick.resize(numel); + } +} + +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// NOTE: host_grid_brick is from ic_kspace post_convolution() +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, + void **host_fdip_phi1, + void **host_fdip_phi2, + void **host_fdip_sum_phi) +{ + int n = 0; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix][0]; + v.y = host_grid_brick[iz][iy][ix][1]; + _cgrid_brick[n] = v; + n++; + } + _cgrid_brick.update_device(_num_grid_points, false); + + #ifdef ASYNC_DEVICE_COPY + ucl_device->sync(); + #endif + + // launch the kernel with its execution configuration (see below) + fphi_uind(); + + // copy data from device to host + _fdip_phi1.update_host(_max_thetai_size*10, false); + _fdip_phi2.update_host(_max_thetai_size*10, false); + 
_fdip_sum_phi.update_host(_max_thetai_size*20, false); + + // return the pointers to the host-side arrays + *host_fdip_phi1 = _fdip_phi1.host.begin(); + *host_fdip_phi2 = _fdip_phi2.host.begin(); + *host_fdip_sum_phi = _fdip_sum_phi.host.begin(); +} + +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_uind() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + // Compute the block size and grid size to keep all cores busy + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + + time_pair.start(); + int ngridxy = _ngridx * _ngridy; + k_fphi_uind.set_size(GX,BX); + k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); + time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// fphi_mpole = multipole potential from grid (limited to polar_kspace for now) +// fphi_mpole extracts the permanent multipole potential from +// the particle mesh Ewald grid +// NOTE: host_grid_brick is from p_kspace post_convolution() +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) +{ + int n = 0; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix]; + v.y = (numtyp)0; + _cgrid_brick[n] = v; + n++; + } + _cgrid_brick.update_device(_num_grid_points, false); + + _felec = felec; + fphi_mpole(); + + _fdip_sum_phi.update_host(_max_thetai_size*20, false); + + *host_fphi = 
_fdip_sum_phi.host.begin(); +} + +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_mpole() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + // Compute the block size and grid size to keep all cores busy + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + + time_pair.start(); + int ngridxy = _ngridx * _ngridy; + k_fphi_mpole.set_size(GX,BX); + k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_sum_phi, &_bsorder, &ainum, &_felec, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); + time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { + + // cast necessary data arrays from host to device + + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); + + *tep_ptr=_tep.host.begin(); + + _off2_polar = off2_polar; + _felec = felec; + _aewald = aewald; + const int red_blocks=polar_real(_eflag,_vflag); + + // only copy answers (forces, energies and virial) back from the device + // in the last kernel (which is polar_real here) + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + + // copy tep from device to host + _tep.update_host(_max_tep_size*4,false); +} 
+ +// --------------------------------------------------------------------------- +// Return the memory bytes allocated on the host and device +// --------------------------------------------------------------------------- + +template +double BaseAmoebaT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseAmoeba); +} + +// --------------------------------------------------------------------------- +// Setup the FFT plan: only placeholder for now +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/) +{ + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) +} + +// --------------------------------------------------------------------------- +// Compute FFT on the device: only placeholder for now +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/, + const int /*numel*/, const int /*mode*/) +{ + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created == false) { + int m = numel/2; + cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); + fft_plan_created = true; + } + + // n = number of double complex + int n = numel/2; + + // copy the host array to the device (data) + UCL_Vector data; + data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE); + int m = 0; + double* d_in = (double*)in; + for (int i = 0; i < n; i++) { + data[i].x = d_in[m]; + data[i].y = d_in[m+1]; + m += 2; + } + data.update_device(false); + + // perform the in-place forward FFT + + cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device, + (cufftDoubleComplex*)&data.device, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result); + ucl_device->sync(); + 
data.update_host(false); + + // copy back the data to the host array + + m = 0; + double* d_out = (double*)out; + for (int i = 0; i < n; i++) { + d_out[m] = data[i].x; + d_out[m+1] = data[i].y; + m += 2; + } + + data.clear(); + #endif +} + +// --------------------------------------------------------------------------- +// Copy the extra data from host to device +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp, double* pval) { + // signal that we need to transfer extra data from the host + + atom->extra_data_unavail(); + + int _nall=atom->nall(); + numtyp4 *pextra=reinterpret_cast(&(atom->extra[0])); + + int n = 0; + int nstride = 1; //4; + if (rpole) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = rpole[i][0]; + pextra[idx].y = rpole[i][1]; + pextra[idx].z = rpole[i][2]; + pextra[idx].w = rpole[i][3]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = rpole[i][4]; + pextra[idx].y = rpole[i][5]; + pextra[idx].z = rpole[i][6]; + pextra[idx].w = rpole[i][8]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = rpole[i][9]; + pextra[idx].y = rpole[i][12]; + pextra[idx].z = (numtyp)amtype[i]; + pextra[idx].w = (numtyp)amgroup[i]; + } + } else { + n += 2*nstride*_nall; + } + + n += nstride*_nall; + if (uind) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = uind[i][0]; + pextra[idx].y = uind[i][1]; + pextra[idx].z = uind[i][2]; + pextra[idx].w = 0; + } + } + + n += nstride*_nall; + if (uinp) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx].x = uinp[i][0]; + pextra[idx].y = uinp[i][1]; + pextra[idx].z = uinp[i][2]; + pextra[idx].w = 0; + } + } + + n += nstride*_nall; + if (pval) { + for (int i = 0; i < _nall; i++) { + int 
idx = n+i*nstride; + pextra[idx].x = pval[i]; + pextra[idx].y = 0; + pextra[idx].z = 0; + pextra[idx].w = 0; + } + } +} + +// --------------------------------------------------------------------------- +// Compile (load) the kernel strings and set the kernels +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname_multipole, + const char *kname_udirect2b, + const char *kname_umutual2b, + const char *kname_polar, + const char *kname_fphi_uind, + const char *kname_fphi_mpole, + const char *kname_short_nbor, + const char* kname_special15) { + if (_compiled) + return; + + if (pair_program) delete pair_program; + pair_program=new UCL_Program(dev); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen); + + k_multipole.set_function(*pair_program, kname_multipole); + k_udirect2b.set_function(*pair_program, kname_udirect2b); + k_umutual2b.set_function(*pair_program, kname_umutual2b); + k_polar.set_function(*pair_program, kname_polar); + k_fphi_uind.set_function(*pair_program, kname_fphi_uind); + k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole); + k_short_nbor.set_function(*pair_program, kname_short_nbor); + k_special15.set_function(*pair_program, kname_special15); + pos_tex.get_texture(*pair_program, "pos_tex"); + q_tex.get_texture(*pair_program, "q_tex"); + + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.has_subgroup_support()) { + int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size); + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + +} + +// --------------------------------------------------------------------------- +// Specify 1-5 neighbors from the current neighbor list +// 
--------------------------------------------------------------------------- + +template +int BaseAmoebaT::add_onefive_neighbors() { + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(ans->inum())/ + (BX/_threads_per_atom))); + + int _nall=atom->nall(); + int ainum=ans->inum(); + int nbor_pitch=nbor->nbor_pitch(); + + k_special15.set_size(GX,BX); + k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), + &atom->dev_tag, &dev_nspecial15, &dev_special15, + &ainum, &_nall, &nbor_pitch, + &_threads_per_atom); + + return GX; +} + +template class BaseAmoeba; +} diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h new file mode 100644 index 0000000000..0eaaafeb1e --- /dev/null +++ b/lib/gpu/lal_base_amoeba.h @@ -0,0 +1,325 @@ +/*************************************************************************** + base_amoeba.h + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#ifndef LAL_BASE_AMOEBA_H +#define LAL_BASE_AMOEBA_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#if defined(USE_OPENCL) +#include "geryon/ocl_texture.h" +#elif defined(USE_CUDART) +#include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +//#define ASYNC_DEVICE_COPY + +#if !defined(USE_OPENCL) && !defined(USE_HIP) +// temporary workaround for int2 also defined in cufft +#ifdef int2 +#undef int2 +#endif +#include "cufft.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseAmoeba { + public: + BaseAmoeba(); + virtual ~BaseAmoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *screen, const void *pair_program, + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); + + /// Estimate the 
overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(const int add_kernels=0); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(screen); + time_pair.add_to_total(); + atom->acc_timers(); + 
ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + int build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint **special15, + bool &success); + + /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed + virtual int** precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); + + /// Compute multipole real-space with device neighboring + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **numj, const double cpu_time, + bool &success, const double aewald, const double felec, + const double off2_mpole, double *charge, double *boxlo, + double *prd, void **tep_ptr); + + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + 
virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, void **fieldp_ptr); + + /// Compute the real space part of the induced field (umutual2b) with device neighboring + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, void **fieldp_ptr); + + /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar + virtual void precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + /// Interpolate the induced potential from the grid + virtual void compute_fphi_uind(double ****host_grid_brick, + void **host_fdip_phi1, void **host_fdip_phi2, + void **host_fdip_sum_phi); + + /// Interpolate the multipolar potential from the grid + virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, + const double felec); + + /// Compute polar real-space with device neighboring + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2_polar, + void **tep_ptr); + + // copy field and fieldp from device to host after umutual2b + virtual void update_fieldp(void **fieldp_ptr) { + *fieldp_ptr=_fieldp.host.begin(); + // _fieldp store both arrays, one after another + _fieldp.update_host(_max_fieldp_size*8,false); + } + + /// setup a plan for FFT, where size is the number of elements + + void setup_fft(const int size, const int 
element_type=0); + + /// compute forward/backward FFT on the device + + void compute_fft1d(void* in, void* out, const int numel, const int mode); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom *atom; + + UCL_Vector polar1, polar2, polar3, polar4, polar5; + + /// cast host arrays into a single array for atom->extra + void cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp, double* pval=nullptr); + + /// Per-atom arrays + UCL_Vector _tep, _fieldp; + int _nmax, _max_tep_size, _max_fieldp_size; + + int _bsorder; + UCL_Vector _thetai1, _thetai2, _thetai3; + UCL_Vector _igrid; + UCL_Vector _cgrid_brick; + UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; + int _max_thetai_size; + int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; + int _ngridx, _ngridy, _ngridz, _num_grid_points; + + int _end_command_queue; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + /// Device storage for 1-5 special neighbor counts + UCL_D_Vec dev_nspecial15; + /// Device storage for special neighbors + UCL_D_Vec dev_special15, dev_special15_t; + + int add_onefive_neighbors(); + + UCL_D_Vec dev_short_nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_fphi_uind, k_fphi_mpole; + UCL_Kernel k_special15, k_short_nbor; + inline int block_size() { return 
_block_size; } + inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {} + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _compiled; + int _block_size, _block_bio_size, _threads_per_atom; + int _extra_fields; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors; + double _gpu_overhead, _driver_overhead; + bool short_nbor_polar_avail; + UCL_D_Vec *_nbor_data; + + numtyp _aewald,_felec; + numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; + + int _eflag, _vflag; + + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); + + virtual int multipole_real(const int eflag, const int vflag) = 0; + virtual int udirect2b(const int eflag, const int vflag) = 0; + virtual int umutual2b(const int eflag, const int vflag) = 0; + virtual int fphi_uind(); + virtual int fphi_mpole(); + virtual int polar_real(const int eflag, const int vflag) = 0; + + + #if !defined(USE_OPENCL) && !defined(USE_HIP) + cufftHandle plan; + #endif + bool fft_plan_created; +}; + +} + +#endif diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index bb0e815b3f..0cfc084fa4 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index 4a59f70d83..3cd6c6030a 100644 --- 
a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,false,nlocal,nall,maxspecial); + bool charge = true; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 66e03de651..6ef1c40ca7 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,true,nlocal,nall,maxspecial); + bool charge = true; + bool rot = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index 44b86abeeb..e103699d40 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); + bool charge = false; + bool rot = false; + bool vel = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel); if (success!=0) return success; @@ -193,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall, const double cpu_time, bool &success, tagint *tag, double **host_v, const double dtinvsqrt, const int seed, const int timestep, - const int nlocal, double *boxlo, double *prd) { + const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -258,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full, const double cpu_time, bool &success, double **host_v, const double dtinvsqrt, const 
int seed, const int timestep, - double *boxlo, double *prd) { + double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index 3457955b3e..bfadfebf66 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall, else _threads_per_atom=device->threads_per_three(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index 8008b1fbb3..0d01d70fb1 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -44,19 +44,15 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const { } template -int CHARMMLongT::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double host_cut_ljsq, const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, - double **sigma, const bool mix_arithmetic) { +int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double ** /*host_offset*/, double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, FILE *_screen, + double host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, const double g_ewald, + 
const double cut_lj_innersq, const double denom_lj, double **epsilon, + double **sigma, const bool mix_arithmetic) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,charmm_long,"k_charmm_long"); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 0d9578b491..dd3ce15827 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -52,7 +52,7 @@ DeviceT::~Device() { } template -int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, +int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu, const int first_gpu_id, const int gpu_mode, const double p_split, const int t_per_atom, const double user_cell_size, char *ocl_args, @@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args) } _ocl_compile_string="-cl-mad-enable "; + #ifdef CL_VERSION_2_0 + _ocl_compile_string+="-cl-std=CL2.0 "; + #endif if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math "; _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+ std::string(OCL_PRECISION_COMPILE); @@ -438,7 +441,7 @@ template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel) { + const bool vel, const int extra_fields) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision()) @@ -467,7 +470,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields)) return -3; _data_in_estimate++; @@ -477,6 +480,9 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (vel) _data_in_estimate++; + if (extra_fields>0) + _data_in_estimate++; + } else { if (!atom.charge() && charge) _data_in_estimate++; @@ -484,7 +490,9 @@ 
int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (!atom.velocity() && vel) _data_in_estimate++; - if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) + if (atom.using_extra() && extra_fields>0) + _data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields)) return -3; } @@ -520,7 +528,7 @@ int DeviceT::init(Answer &ans, const int nlocal, template int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, - const int host_nlocal, const int nall, + const int host_nlocal, const int /*nall*/, const int maxspecial, const int gpu_host, const int max_nbors, const double cutoff, const bool pre_cut, const int threads_per_atom, diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index f5136d9fa0..3b27223007 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -61,6 +61,7 @@ class Device { * \param nall Total number of local+ghost particles * \param maxspecial Maximum mumber of special bonded atoms per atom * \param vel True if velocities need to be stored + * \param extra_fields Nonzero if extra fields need to be stored * * Returns: * - 0 if successful @@ -70,7 +71,7 @@ class Device { * - -5 Double precision is not supported on card **/ int init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Initialize the device for Atom storage only /** \param nlocal Total number of local particles to allocate memory for diff --git a/lib/gpu/lal_dpd_tstat_ext.cpp b/lib/gpu/lal_dpd_tstat_ext.cpp index 2b63bf62e7..78a1bf2d9d 100644 --- a/lib/gpu/lal_dpd_tstat_ext.cpp +++ b/lib/gpu/lal_dpd_tstat_ext.cpp @@ -28,10 +28,10 @@ static DPD DPDTMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double 
**host_a0, - double **host_gamma, double **host_sigma, double **host_cut, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen) { + double **host_gamma, double **host_sigma, double **host_cut, + double *special_lj, const int inum, + const int nall, const int /*max_nbors*/, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { DPDTMF.clear(); gpu_mode=DPDTMF.device->gpu_mode(); double gpu_split=DPDTMF.device->particle_split(); diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 2c0d63f7bf..b7bc7b958a 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, + const bool /*eatom*/, const bool /*vatom*/, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); @@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag_in, - const bool vflag_in, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, + const bool vflag_in, const bool /*eatom*/, + const bool /*vatom*/, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp new file mode 100644 index 0000000000..24ffae8de2 --- /dev/null +++ b/lib/gpu/lal_hippo.cpp @@ -0,0 +1,641 @@ +/*************************************************************************** + hippo.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the hippo pair style. 
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "hippo_cl.h" +#elif defined(USE_CUDART) +const char *hippo=0; +#else +#include "hippo_cubin.h" +#endif + +#include "lal_hippo.h" +#include +namespace LAMMPS_AL { +#define HippoT Hippo + +extern Device device; + +template +HippoT::Hippo() : BaseAmoeba(), + _allocated(false) { +} + +template +HippoT::~Hippo() { + clear(); + k_repulsion.clear(); + k_dispersion.clear(); + +} + +template +int HippoT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,hippo, + "k_hippo_multipole", "k_hippo_udirect2b", + "k_hippo_umutual2b", "k_hippo_polar", + "k_hippo_fphi_uind", "k_hippo_fphi_mpole", 
+ "k_hippo_short_nbor", "k_hippo_special15"); + if (success!=0) + return success; + + // specific to HIPPO + k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion"); + k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = host_dirdamp[i]; + host_write[i].w = host_amtype2class[i]; + } + + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_sizpr[i]; + host_write[i].y = host_dmppr[i]; + host_write[i].z = host_elepr[i]; + host_write[i].w = (numtyp)0; + } + + coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_rep,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = host_pcore[i]; + host_write2[i].w = host_palpha[i]; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_polar_wscale[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=host_special_mpole[i]; 
+ } + ucl_copy(sp_polar,dview,5,false); + + sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_repel[i]; + dview[i].y=host_special_disp[i]; + dview[i].z=(numtyp)0; + dview[i].w=(numtyp)0; + } + ucl_copy(sp_nonpolar,dview,5,false); + + _polar_dscale = polar_dscale; + _polar_uscale = polar_uscale; + + _allocated=true; + this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + + coeff_amclass.row_bytes() + sp_polar.row_bytes() + + sp_nonpolar.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); + return 0; +} + +template +void HippoT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff_amtype.clear(); + coeff_rep.clear(); + coeff_amclass.clear(); + sp_polar.clear(); + sp_nonpolar.clear(); + + this->clear_atomic(); +} + +template +double HippoT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Hippo); +} + +// --------------------------------------------------------------------------- +// Compute the repulsion term, returning tep +// --------------------------------------------------------------------------- +template +void HippoT::compute_repulsion(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double /*aewald*/, const double off2_repulse, + double * /*host_q*/, double * /*boxlo*/, double * /*prd*/, + double cut2, 
double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_repulse = off2_repulse; + _cut2 = cut2; + _c0 = c0; + _c1 = c1; + _c2 = c2; + _c3 = c3; + _c4 = c4; + _c5 = c5; + repulsion(this->_eflag,this->_vflag); + + // copy tep from device to host + this->_tep.update_host(this->_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Launch the repulsion kernel +// --------------------------------------------------------------------------- +template +int HippoT::repulsion(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_disp, + // at this point repuslion is the first kernel in a time step for HIPPO + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_repulse, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_repulsion.set_size(GX,BX); + k_repulsion.run(&this->atom->x, &this->atom->extra, + &coeff_rep, 
&sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_repulse, &_cut2, + &_c0, &_c1, &_c2, &_c3, &_c4, &_c5); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Compute dispersion real-space +// --------------------------------------------------------------------------- +template +void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp) { + + // cast necessary data arrays from host to device + + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr); + this->atom->add_extra_data(); + + this->_off2_disp = off2_disp; + this->_aewald = aewald; + dispersion_real(this->_eflag,this->_vflag); + + // only copy them back if this is the last kernel + // otherwise, commenting out these two lines to leave the answers + // (forces, energies and virial) on the device until the last kernel + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); +} + +// --------------------------------------------------------------------------- +// Launch the dispersion real-space kernel +// --------------------------------------------------------------------------- +template +int HippoT::dispersion_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff 
off2_disp, + // at this point dispersion is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_disp, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_dispersion.set_size(GX,BX); + k_dispersion.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_disp); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Compute the multipole real-space term, returning tep +// --------------------------------------------------------------------------- +template +void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double* host_pval, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool /*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double aewald, const double felec, + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { + + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval); + this->atom->add_extra_data(); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + 
this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_mpole = off2_mpole; + this->_felec = felec; + this->_aewald = aewald; + multipole_real(this->_eflag,this->_vflag); + + // copy tep from device to host + this->_tep.update_host(this->_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Launch the multipole real-space kernel +// --------------------------------------------------------------------------- +template +int HippoT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Compute the direct real space part of the permanent field +// returning field and fieldp +// 
--------------------------------------------------------------------------- +template +void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double* host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr) { + + // all the necessary data arrays are already copied from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval); + this->atom->add_extra_data(); + + *fieldp_ptr=this->_fieldp.host.begin(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + udirect2b(this->_eflag,this->_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + this->_fieldp.update_host(this->_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Launch the real-space permanent field kernel +// --------------------------------------------------------------------------- +template +int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_udirect2b.set_size(GX,BX); + 
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Compute the direct real space term of the induced field +// returning field and fieldp +// --------------------------------------------------------------------------- +template +void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) { + + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + umutual2b(this->_eflag,this->_vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + // NOTE: move this step to update_fieldp() to delay device-host transfer + // *fieldp_ptr=this->_fieldp.host.begin(); + // this->_fieldp.update_host(this->_max_fieldp_size*8,false); +} + +// --------------------------------------------------------------------------- +// Launch the real-space induced field kernel +// --------------------------------------------------------------------------- +template +int HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int 
GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); + + *tep_ptr=this->_tep.host.begin(); + + this->_off2_polar = off2_polar; + this->_felec = felec; + this->_aewald = aewald; + const int red_blocks=polar_real(this->_eflag,this->_vflag); + + // only copy answers (forces, energies and virial) back from the device + // in the last kernel in a timestep (which is polar_real here) + 
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); + + // copy tep from device to host + this->_tep.update_host(this->_max_tep_size*4,false); +} + +// --------------------------------------------------------------------------- +// Launch the polar real-space kernel +// --------------------------------------------------------------------------- +template +int HippoT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + */ + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + 
this->short_nbor_polar_avail = false; + + return GX; +} + +template class Hippo; +} diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu new file mode 100644 index 0000000000..99e20db223 --- /dev/null +++ b/lib/gpu/lal_hippo.cu @@ -0,0 +1,2519 @@ +// ************************************************************************** +// hippo.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the hippo pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_hippo_extra.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define store_answers_hippo_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + 
tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; 
s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r 
= 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii cut2) { + numtyp r3 = r2 * r; + numtyp r4 = r2 * r2; + numtyp r5 = r2 * r3; + numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0; + numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp)4.0*c4*r3 + + (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1; + dtaper *= e * rr1; + e *= taper; + frcx = frcx*taper - dtaper*xr; + frcy = frcy*taper - dtaper*yr; + frcz = frcz*taper - dtaper*zr; + ttmix *= taper; + ttmiy *= taper; + ttmiz *= taper; + } + + energy += e; + + // increment force-based gradient and torque on atom I + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + // increment the internal virial tensor components + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + int m; + for (m = 1; m < 6; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; 
m < 6; m++) bn[m] *= felec; + + numtyp term1,term2,term3; + numtyp term4,term5,term6; + + term1 = corei*corek; + numtyp term1i = corek*vali; + numtyp term2i = corek*dir; + numtyp term3i = corek*qir; + numtyp term1k = corei*valk; + numtyp term2k = -corei*dkr; + numtyp term3k = corei*qkr; + numtyp term1ik = vali*valk; + numtyp term2ik = valk*dir - vali*dkr + dik; + numtyp term3ik = vali*qkr + valk*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); + numtyp term4ik = dir*qkr - dkr*qir - 4.0*qik; + numtyp term5ik = qir*qkr; + numtyp dmpi[9],dmpj[9]; + numtyp dmpij[11]; + damppole(r,11,alphai,alphak,dmpi,dmpj,dmpij); + numtyp scalek = factor_mpole; + numtyp rr1i = bn[0] - ((numtyp)1.0-scalek*dmpi[0])*rr1; + numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3; + numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5; + numtyp rr7i = bn[3] - ((numtyp)1.0-scalek*dmpi[6])*rr7; + numtyp rr1k = bn[0] - ((numtyp)1.0-scalek*dmpj[0])*rr1; + numtyp rr3k = bn[1] - ((numtyp)1.0-scalek*dmpj[2])*rr3; + numtyp rr5k = bn[2] - ((numtyp)1.0-scalek*dmpj[4])*rr5; + numtyp rr7k = bn[3] - ((numtyp)1.0-scalek*dmpj[6])*rr7; + numtyp rr1ik = bn[0] - ((numtyp)1.0-scalek*dmpij[0])*rr1; + numtyp rr3ik = bn[1] - ((numtyp)1.0-scalek*dmpij[2])*rr3; + numtyp rr5ik = bn[2] - ((numtyp)1.0-scalek*dmpij[4])*rr5; + numtyp rr7ik = bn[3] - ((numtyp)1.0-scalek*dmpij[6])*rr7; + numtyp rr9ik = bn[4] - ((numtyp)1.0-scalek*dmpij[8])*rr9; + numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11; + rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1; + rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; + numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + + term1i*rr1i + term1k*rr1k + term1ik*rr1ik + + term2i*rr3i + term2k*rr3k + term2ik*rr3ik + + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; + + // find damped multipole intermediates for force and torque + + numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + + term1i*rr3i + term1k*rr3k + term1ik*rr3ik + + term2i*rr5i + term2k*rr5k + term2ik*rr5ik + + term3i*rr7i + term3k*rr7k + 
term3ik*rr7ik; + term1 = -corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik; + term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik; + term3 = (numtyp)2.0 * rr5ik; + term4 = (numtyp)-2.0 * (corek*rr5i+valk*rr5ik - dkr*rr7ik+qkr*rr9ik); + term5 = (numtyp)-2.0 * (corei*rr5k+vali*rr5ik + dir*rr7ik+qir*rr9ik); + term6 = (numtyp)4.0 * rr7ik; + rr3 = rr3ik; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nbor (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + numtyp bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = 
(bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply charge penetration damping to scale factors + + numtyp corek = coeff_amclass[jtype].z; // pcore[jclass]; + numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; + numtyp valk = polar6[j].x; + numtyp dmpi[9],dmpk[9]; + numtyp dmpik[9]; + damppole(r,9,alphai,alphak,dmpi,dmpk,dmpik); + numtyp rr3core = bn[1] - ((numtyp)1.0-factor_dscale)*rr3; + numtyp rr5core = bn[2] - ((numtyp)1.0-factor_dscale)*rr5; + + numtyp rr3i = bn[1] - ((numtyp)1.0-factor_dscale*dmpi[2])*rr3; + numtyp rr5i = bn[2] - ((numtyp)1.0-factor_dscale*dmpi[4])*rr5; + numtyp rr7i = bn[3] - ((numtyp)1.0-factor_dscale*dmpi[6])*rr7; + numtyp rr9i = bn[4] - ((numtyp)1.0-factor_dscale*dmpi[8])*rr9; + numtyp rr3k = bn[1] - ((numtyp)1.0-factor_dscale*dmpk[2])*rr3; + numtyp rr5k = bn[2] - ((numtyp)1.0-factor_dscale*dmpk[4])*rr5; + numtyp rr7k = bn[3] - ((numtyp)1.0-factor_dscale*dmpk[6])*rr7; + numtyp rr9k = bn[4] - ((numtyp)1.0-factor_dscale*dmpk[8])*rr9; + numtyp rr5ik = bn[2] - ((numtyp)1.0-factor_wscale*dmpik[4])*rr5; + numtyp rr7ik = bn[3] - ((numtyp)1.0-factor_wscale*dmpik[6])*rr7; + + // get the induced dipole field used for dipole torques + + numtyp tix3 = (numtyp)2.0*rr3i*ukx; + numtyp tiy3 = (numtyp)2.0*rr3i*uky; + numtyp tiz3 = (numtyp)2.0*rr3i*ukz; + numtyp tuir = (numtyp)-2.0*rr5i*ukr; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)4.0 * (rr5i*ukx); + numtyp tiy5 = (numtyp)4.0 * (rr5i*uky); + numtyp tiz5 = (numtyp)4.0 * (rr5i*ukz); + tuir = (numtyp)-2.0*rr7i*ukr; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the field 
gradient for direct polarization force + + numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; + numtyp term1k,term2k,term3k,term4k,term5k,term6k,term7k,term8k; + numtyp term1core; + numtyp tixx,tiyy,tizz,tixy,tixz,tiyz; + numtyp tkxx,tkyy,tkzz,tkxy,tkxz,tkyz; + + term1i = rr3i - rr5i*xr*xr; + term1core = rr3core - rr5core*xr*xr; + term2i = (numtyp)2.0*rr5i*xr ; + term3i = rr7i*xr*xr - rr5i; + term4i = (numtyp)2.0*rr5i; + term5i = (numtyp)5.0*rr7i*xr; + term6i = rr9i*xr*xr; + term1k = rr3k - rr5k*xr*xr; + term2k = (numtyp)2.0*rr5k*xr; + term3k = rr7k*xr*xr - rr5k; + term4k = (numtyp)2.0*rr5k; + term5k = (numtyp)5.0*rr7k*xr; + term6k = rr9k*xr*xr; + tixx = vali*term1i + corei*term1core + dix*term2i - dir*term3i - + qixx*term4i + qix*term5i - qir*term6i + (qiy*yr+qiz*zr)*rr7i; + tkxx = valk*term1k + corek*term1core - dkx*term2k + dkr*term3k - + qkxx*term4k + qkx*term5k - qkr*term6k + (qky*yr+qkz*zr)*rr7k; + + term1i = rr3i - rr5i*yr*yr; + term1core = rr3core - rr5core*yr*yr; + term2i = (numtyp)2.0*rr5i*yr; + term3i = rr7i*yr*yr - rr5i; + term4i = (numtyp)2.0*rr5i; + term5i = (numtyp)5.0*rr7i*yr; + term6i = rr9i*yr*yr; + term1k = rr3k - rr5k*yr*yr; + term2k = (numtyp)2.0*rr5k*yr; + term3k = rr7k*yr*yr - rr5k; + term4k = (numtyp)2.0*rr5k; + term5k = (numtyp)5.0*rr7k*yr; + term6k = rr9k*yr*yr; + tiyy = vali*term1i + corei*term1core + diy*term2i - dir*term3i - + qiyy*term4i + qiy*term5i - qir*term6i + (qix*xr+qiz*zr)*rr7i; + tkyy = valk*term1k + corek*term1core - dky*term2k + dkr*term3k - + qkyy*term4k + qky*term5k - qkr*term6k + (qkx*xr+qkz*zr)*rr7k; + + term1i = rr3i - rr5i*zr*zr; + term1core = rr3core - rr5core*zr*zr; + term2i = (numtyp)2.0*rr5i*zr; + term3i = rr7i*zr*zr - rr5i; + term4i = (numtyp)2.0*rr5i; + term5i = (numtyp)5.0*rr7i*zr; + term6i = rr9i*zr*zr; + term1k = rr3k - rr5k*zr*zr; + term2k = (numtyp)2.0*rr5k*zr; + term3k = rr7k*zr*zr - rr5k; + term4k = (numtyp)2.0*rr5k; + term5k = (numtyp)5.0*rr7k*zr; + term6k = rr9k*zr*zr; + tizz = vali*term1i + 
corei*term1core + diz*term2i - dir*term3i - + qizz*term4i + qiz*term5i - qir*term6i + (qix*xr+qiy*yr)*rr7i; + tkzz = valk*term1k + corek*term1core - dkz*term2k + dkr*term3k - + qkzz*term4k + qkz*term5k - qkr*term6k + (qkx*xr+qky*yr)*rr7k; + + term2i = rr5i*xr ; + term1i = yr * term2i; + term1core = rr5core*xr*yr; + term3i = rr5i*yr; + term4i = yr * (rr7i*xr); + term5i = (numtyp)2.0*rr5i; + term6i = (numtyp)2.0*rr7i*xr; + term7i = (numtyp)2.0*rr7i*yr; + term8i = yr*rr9i*xr; + term2k = rr5k*xr; + term1k = yr * term2k; + term3k = rr5k*yr; + term4k = yr * (rr7k*xr); + term5k = (numtyp)2.0*rr5k; + term6k = (numtyp)2.0*rr7k*xr; + term7k = (numtyp)2.0*rr7k*yr; + term8k = yr*rr9k*xr; + tixy = -vali*term1i - corei*term1core + diy*term2i + dix*term3i - + dir*term4i - qixy*term5i + qiy*term6i + qix*term7i - qir*term8i; + tkxy = -valk*term1k - corek*term1core - dky*term2k - dkx*term3k + + dkr*term4k - qkxy*term5k + qky*term6k + qkx*term7k - qkr*term8k; + + term2i = rr5i*xr; + term1i = zr * term2i; + term1core = rr5core*xr*zr; + term3i = rr5i*zr; + term4i = zr * (rr7i*xr); + term5i = (numtyp)2.0*rr5i; + term6i = (numtyp)2.0*rr7i*xr; + term7i = (numtyp)2.0*rr7i*zr; + term8i = zr*rr9i*xr; + term2k = rr5k*xr; + term1k = zr * term2k; + term3k = rr5k*zr; + term4k = zr * (rr7k*xr); + term5k = (numtyp)2.0*rr5k; + term6k = (numtyp)2.0*rr7k*xr; + term7k = (numtyp)2.0*rr7k*zr; + term8k = zr*rr9k*xr; + tixz = -vali*term1i - corei*term1core + diz*term2i + dix*term3i - + dir*term4i - qixz*term5i + qiz*term6i + qix*term7i - qir*term8i; + tkxz = -valk*term1k - corek*term1core - dkz*term2k - dkx*term3k + + dkr*term4k - qkxz*term5k + qkz*term6k + qkx*term7k - qkr*term8k; + + term2i = rr5i*yr; + term1i = zr * term2i; + term1core = rr5core*yr*zr; + term3i = rr5i*zr; + term4i = zr * (rr7i*yr); + term5i = (numtyp)2.0*rr5i; + term6i = (numtyp)2.0*rr7i*yr; + term7i = (numtyp)2.0*rr7i*zr; + term8i = zr*rr9i*yr; + term2k = rr5k*yr; + term1k = zr * term2k; + term3k = rr5k*zr; + term4k = zr * (rr7k*yr); 
+ term5k = (numtyp)2.0*rr5k; + term6k = (numtyp)2.0*rr7k*yr; + term7k = (numtyp)2.0*rr7k*zr; + term8k = zr*rr9k*yr; + tiyz = -vali*term1i - corei*term1core + diz*term2i + diy*term3i - + dir*term4i - qiyz*term5i + qiz*term6i + qiy*term7i - qir*term8i; + tkyz = -valk*term1k - corek*term1core - dkz*term2k - dky*term3k + + dkr*term4k - qkyz*term5k + qkz*term6k + qky*term7k - qkr*term8k; + + numtyp depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + numtyp depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + numtyp depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + numtyp frcx = (numtyp)-2.0 * depx; + numtyp frcy = (numtyp)-2.0 * depy; + numtyp frcz = (numtyp)-2.0 * depz; + + numtyp term1,term2,term3; + + // get the dEp/dR terms used for direct polarization force + // poltyp == MUTUAL && hippo + // tixx and tkxx + term1 = (numtyp)2.0 * rr5ik; + term2 = term1*xr; + term3 = rr5ik - rr7ik*xr*xr; + tixx = uix*term2 + uir*term3; + tkxx = ukx*term2 + ukr*term3; + + // tiyy and tkyy + term2 = term1*yr; + term3 = rr5ik - rr7ik*yr*yr; + tiyy = uiy*term2 + uir*term3; + tkyy = uky*term2 + ukr*term3; + + // tiz and tkzz + term2 = term1*zr; + term3 = rr5ik - rr7ik*zr*zr; + tizz = uiz*term2 + uir*term3; + tkzz = ukz*term2 + ukr*term3; + + // tixy and tkxy + term1 = rr5ik*yr; + term2 = rr5ik*xr; + term3 = yr * (rr7ik*xr); + tixy = uix*term1 + uiy*term2 - uir*term3; + tkxy = ukx*term1 + uky*term2 - ukr*term3; + + // tixx and tkxx + term1 = rr5ik * zr; + term3 = zr * (rr7ik*xr); + tixz = uix*term1 + uiz*term2 - uir*term3; + tkxz = ukx*term1 + ukz*term2 - ukr*term3; + + // tiyz and tkyz + term2 = rr5ik*yr; + term3 = zr * (rr7ik*yr); + tiyz = uiy*term1 + uiz*term2 - uir*term3; + tkyz = uky*term1 + ukz*term2 - ukr*term3; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp 
+ tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx - depx; + frcy = frcy - depy; + frcz = frcz - depz; + + f.x += frcx; + f.y += frcy; + f.z += frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; k +class Hippo : public BaseAmoeba { + public: + Hippo(); + ~Hippo(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double polar_dscale, 
const double polar_uscale); + + /// Compute repulsion with device neighboring + virtual void compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5,void** tep_ptr); + + /// Compute dispersion real-space with device neighboring + virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp); + + /// Compute multipole real-space with device neighboring + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double felec, const double off2_mpole, double *charge, + double *boxlo, double *prd, void **tep_ptr); + + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double* host_pval, + const double aewald, const double off2_polar, void** fieldp_ptr); + + /// Compute the real space part of the induced field 
(umutual2b) with device neighboring + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr); + + /// Compute polar real-space with device neighboring + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2_polar, + void **tep_ptr); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; + /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z; + UCL_D_Vec coeff_rep; + /// Special polar values [0-4]: + /// sp_polar.x = special_polar_wscale + /// sp_polar.y special_polar_pscale, + /// sp_polar.z = special_polar_piscale + /// sp_polar.w = special_mpole + UCL_D_Vec sp_polar; + /// Special nonpolar values [0-4]: + /// sp_nonpolar.x = special_hal + /// sp_nonpolar.y special_repel + /// sp_nonpolar.z = special_disp + UCL_D_Vec sp_nonpolar; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5; + numtyp _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + UCL_Kernel k_repulsion, k_dispersion; + + 
protected: + bool _allocated; + int repulsion(const int eflag, const int vflag); + int dispersion_real(const int eflag, const int vflag); + int multipole_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + +}; + +} + +#endif diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp new file mode 100644 index 0000000000..0cb00387ca --- /dev/null +++ b/lib/gpu/lal_hippo_ext.cpp @@ -0,0 +1,231 @@ +/*************************************************************************** + hippo_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to hippo acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_hippo.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Hippo HIPPOMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + 
const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale) { + HIPPOMF.clear(); + gpu_mode=HIPPOMF.device->gpu_mode(); + double gpu_split=HIPPOMF.device->particle_split(); + int first_gpu=HIPPOMF.device->first_device(); + int last_gpu=HIPPOMF.device->last_device(); + int world_me=HIPPOMF.device->world_me(); + int gpu_rank=HIPPOMF.device->gpu_rank(); + int procs_per_gpu=HIPPOMF.device->procs_per_gpu(); + + HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu); + + bool message=false; + if (HIPPOMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); + + HIPPOMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + HIPPOMF.estimate_gpu_overhead(); + return init_ok; +} + +void hippo_gpu_clear() { + HIPPOMF.clear(); +} + +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double ** /*host_uind*/, double ** /*host_uinp*/, 
double * /*host_pval*/, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); +} + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { + HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd, + cut2, c0, c1, c2, c3, c4, c5, tep_ptr); +} + +void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2) { + HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole, + aewald, off2); +} + +void hippo_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int 
*host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); +} + +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); +} + +void hippo_gpu_update_fieldp(void **fieldp_ptr) { + HIPPOMF.update_fieldp(fieldp_ptr); +} + +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, 
host_uind, host_uinp, host_pval, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); +} + +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi); +} + +double hippo_gpu_bytes() { + return HIPPOMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h new file mode 100644 index 0000000000..7ff62aa9a4 --- /dev/null +++ b/lib/gpu/lal_hippo_extra.h @@ -0,0 +1,431 @@ +/// ************************************************************************** +// hippo_extra.h +// ------------------- +// Trung Dac Nguyen +// +// Device code for hippo math routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndactrung@gmail.com +// ***************************************************************************/* + +#ifndef LAL_HIPPO_EXTRA_H +#define LAL_HIPPO_EXTRA_H + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#else +#endif + +#define MY_PI2 (numtyp)1.57079632679489661923 +#define MY_PI4 (numtyp)0.78539816339744830962 + +/* ---------------------------------------------------------------------- + damprep generates coefficients for the Pauli repulsion + damping function for powers of 
the interatomic distance + + literature reference: + + J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An + Anisotropic, Atomic Multipole Model", Journal of Chemical Physics, + 150, 084104 (2019) +------------------------------------------------------------------------- */ + +ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, + const numtyp rr3, const numtyp rr5, const numtyp rr7, + const numtyp rr9, const numtyp rr11, const int rorder, + const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11]) +{ + numtyp r3,r4; + numtyp r5,r6,r7,r8; + numtyp s,ds,d2s; + numtyp d3s,d4s,d5s; + numtyp dmpi2,dmpk2; + numtyp dmpi22,dmpi23; + numtyp dmpi24,dmpi25; + numtyp dmpi26,dmpi27; + numtyp dmpk22,dmpk23; + numtyp dmpk24,dmpk25; + numtyp dmpk26; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp pre,term,tmp; + + // compute tolerance value for damping exponents + + eps = (numtyp)0.001; + diff = dmpi-dmpk; // fabs(dmpi-dmpk) + if (diff < (numtyp)0) diff = -diff; + + // treat the case where alpha damping exponents are equal + + if (diff < eps) { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + r6 = r5 * r; + r7 = r6 * r; + dmpi2 = (numtyp)0.5 * dmpi; + dampi = dmpi2 * r; + expi = ucl_exp(-dampi); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpi26 = dmpi25 * dmpi2; + pre = (numtyp)128.0; + s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi; + + ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0; + d2s = dmpi24 * expi * r5 / (numtyp)9.0; + d3s = dmpi25 * expi * r6 / (numtyp)45.0; + d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0; + if (rorder >= 11) { + r8 = r7 * r; + dmpi27 = dmpi2 * dmpi26; + d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0; + } + + // treat the case where alpha damping exponents are unequal + + } else { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + dmpi2 = (numtyp)0.5 * dmpi; + dmpk2 = (numtyp)0.5 * dmpk; + dampi = 
dmpi2 * r; + dampk = dmpk2 * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpk22 = dmpk2 * dmpk2; + dmpk23 = dmpk22 * dmpk2; + dmpk24 = dmpk23 * dmpk2; + dmpk25 = dmpk24 * dmpk2; + term = dmpi22 - dmpk22; + pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0); + tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; + s = (dampi-tmp)*expk + (dampk+tmp)*expi; + + ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - + ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + + ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d3s = (dmpi2*dmpk23*r4/(numtyp)15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - + ((numtyp)4.0/(numtyp)15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - + (numtyp)4.0*dmpi2*dmpk22*r/term - (numtyp)4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + + ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi; + d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - + ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - + ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + 
(dmpi24*dmpk2*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + + ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + + ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + + if (rorder >= 11) { + r6 = r5 * r; + dmpi26 = dmpi25 * dmpi2; + dmpk26 = dmpk25 * dmpk2; + d5s = (dmpi2*dmpk25*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi2*dmpk24*r5 + + dmpi2*dmpk23*r4/(numtyp)21.0 + dmpi2*dmpk22*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 - + ((numtyp)4.0/(numtyp)945.0)*dmpi2*dmpk26*r5/term - + ((numtyp)4.0/(numtyp)63.0)*dmpi2*dmpk25*r4/term - ((numtyp)4.0/(numtyp)9.0)*dmpi2*dmpk24*r3/term - + ((numtyp)16.0/(numtyp)9.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi25*dmpk2*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi24*dmpk2*r5 + + dmpi23*dmpk2*r4/(numtyp)21.0 + dmpi22*dmpk2*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 + + ((numtyp)4.0/(numtyp)945.0)*dmpi26*dmpk2*r5/term + ((numtyp)4.0/(numtyp)63.0)*dmpi25*dmpk2*r4/term + + ((numtyp)4.0/(numtyp)9.0)*dmpi24*dmpk2*r3/term + ((numtyp)16.0/(numtyp)9.0)*dmpi23*dmpk2*r2/term + + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + } + } + + // convert partial derivatives into full derivatives + + s = s * rr1; + ds = ds * rr3; + d2s = d2s * rr5; + d3s = d3s * rr7; + d4s = d4s * rr9; + d5s = d5s * rr11; + dmpik[0] = (numtyp)0.5 * pre * s * s; + dmpik[2] = pre * s * ds; + dmpik[4] = pre * (s*d2s + ds*ds); + dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s); + dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s); + + if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s); +} + +/* ---------------------------------------------------------------------- + damppole generates coefficients for the charge 
penetration + damping function for powers of the interatomic distance + + literature references: + + L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the + Effective Fragment Potential Method: Theory and Application to + the Benzene Dimer", Journal of Computational Chemistry, 28, + 276-291 (2007) [Gordon f1 and f2 models] + + J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and + J. W. Ponder, "An Optimized Charge Penetration Model for Use with + the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19, + 276-291 (2017) +------------------------------------------------------------------------- */ + +ucl_inline void damppole(const numtyp r, const int rorder, + const numtyp alphai, const numtyp alphak, + numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampi6,dampi7; + numtyp dampi8; + numtyp dampk2,dampk3; + numtyp dampk4,dampk5; + numtyp dampk6; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi; 
+ if (diff < eps) { + dmpk[0] = dmpi[0]; + dmpk[2] = dmpi[2]; + dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + dmpk[8] = dmpi[8]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dampk5 = dampk2 * dampk3; + dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk; + dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk; + } + + // valence-valence charge penetration damping for Gordon f1 + + if (diff < eps) { + dampi6 = dampi3 * dampi3; + dampi7 = dampi3 * dampi4; + dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + + dampi3/(numtyp)48.0)*expi; + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi; + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; + dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi; + dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0)*expi; + if (rorder >= 11) { + dampi8 = dampi4 * dampi4; + dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi; + } + + } else { + alphai2 = alphai * alphai; + 
alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * termk; + dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - + termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk; + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk; + dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk; + dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + + (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + + (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk; + + if (rorder >= 11) { + dampi6 = dampi3 * dampi3; + dampk6 = dampk3 * 
dampk3; + dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + + dampi6/(numtyp)1890.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + + dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk; + } + } +} + +/* ---------------------------------------------------------------------- + dampdir = direct field damping coefficents + dampdir generates coefficients for the direct field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ + +ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk) +{ + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampk2; + numtyp dampi3,dampk3; + numtyp dampi4,dampk4; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = dampi2 * dampi2; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + if (diff < eps) { + dmpk[2] = dmpi[2]; + 
dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/30.0)*expk; + } +} + +/* ---------------------------------------------------------------------- + dampmut = mutual field damping coefficents + dampmut generates coefficients for the mutual field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ + +ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampk2,dampk3; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // valence-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + if (diff < eps) { + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + 7.0*dampi3/(numtyp)48.0 + dampi4/48.0)*expi; + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + alphai2 = alphai * alphai; + alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * 
termk; + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - + (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - + (numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk; + } +} + +#endif diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index a327fdd45b..10816e2fa6 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -576,6 +576,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); if (_time_device) time_nbor.add_to_total(); + + // on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial) + // on the device, transpose the matrix (1-d array) for coalesced reads + // dev_special[i][j] = the special i neighbor of atom j + time_transpose.start(); const int b2x=_block_cell_2d; const int b2y=_block_cell_2d; @@ -679,6 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (_cutoff < _cell_size) vadjust*=1.46; mn=std::max(mn,static_cast(ceil(_max_neighbor_factor*vadjust*mn))); if (mn<33) mn+=3; + resize_max_neighbors(mn,success); set_nbor_block_size(mn/2); if (!success) @@ -831,6 +837,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); } +void Neighbor::transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in) +{ + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; + const int g2x=static_cast(ceil(static_cast(columns_in)/b2x)); + const int g2y=static_cast(ceil(static_cast(rows_in)/b2y)); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + 
_shared->k_transpose.run(&out, &in, &columns_in, &rows_in); +} + template void Neighbor::build_nbor_list (double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, double *subhi, diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 5b569f804a..45ec95a9d1 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -33,7 +33,7 @@ #endif #endif -#if defined(USE_HIP) +#if defined(USE_HIP) || defined(__APPLE__) #define LAL_USE_OLD_NEIGHBOR #endif @@ -259,6 +259,10 @@ class Neighbor { return o.str(); } + /// Helper function + void transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in); + private: NeighborShared *_shared; UCL_Device *dev; @@ -289,15 +293,17 @@ class Neighbor { #endif int _simd_size; + #ifdef LAL_USE_OLD_NEIGHBOR inline void set_nbor_block_size(const int mn) { - #ifdef LAL_USE_OLD_NEIGHBOR int desired=mn/(2*_simd_size); desired*=_simd_size; if (desired<_simd_size) desired=_simd_size; else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build; _block_nbor_build=desired; - #endif } + #else + inline void set_nbor_block_size(const int) {} + #endif }; } diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 352f1d6138..359d9b75cb 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -48,6 +48,19 @@ _texture_2d( pos_tex,int4); #define LAL_USE_OLD_NEIGHBOR #endif +/* + compute the id of the cell where the atoms belong to +x: atom coordinates +cell_id: cell ids +particle_id: +boxlo[0-2]: the lower left corner of the local box +ncell[xyz]: the number of cells in xyz dims +i_cell_size is the inverse cell size +inum = the number of the local atoms that are ported to the device +nall = the number of the local+ghost atoms that are ported to the device +cells_in_cutoff = the number of cells that are within the cutoff +*/ + __kernel void calc_cell_id(const numtyp4 *restrict x_, unsigned *restrict cell_id, int *restrict 
particle_id, @@ -90,6 +103,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_, } } +// compute the number of atoms in each cell + __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, int *restrict cell_counts, int nall, int ncell) { diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index e17df5b88c..a8e929efe4 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -273,19 +273,19 @@ __kernel void interp(const __global numtyp4 *restrict x_, int my=mz+fast_mul(ny,npts_x); for (int m=0; m> SBBITS & 3; }; +#define SBBITS15 29 +#define NEIGHMASK15 0x1FFFFFFF +ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; }; + // default to 32-bit smallint and other ints, 64-bit bigint: // same as defined in src/lmptype.h #if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index eb42c710cc..9687a0352d 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -150,7 +150,7 @@ double SWT::host_memory_usage() const { // --------------------------------------------------------------------------- template int SWT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index 8baa5ce12a..feab8bb5c0 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4); } \ } +// (SHUFFLE_AVAIL == 1) #else #define local_allocate_acc_zeta() @@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4); } \ } +// EVFLAG == 0 #else #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ @@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4); ans[ii]=old; \ } -#endif -#endif +#endif // EVFLAG +#endif // SHUFFLE_AVAIL #ifdef LAL_SIMD_IP_SYNC #define t_per_atom t_per_atom_in diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index c343de3f55..fcc9d00ab0 
100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i const double* costheta, const double* bigb, const double* big2b, const double* bigc) { - int success; + int success=0; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,vashishta,"k_vashishta","k_vashishta_three_center", "k_vashishta_three_end","k_vashishta_short_nbor"); @@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const { // --------------------------------------------------------------------------- template int VashishtaT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list diff --git a/lib/kokkos/kokkos_5538.diff b/lib/kokkos/kokkos_5538.diff deleted file mode 100644 index 6bf2ccf6a4..0000000000 --- a/lib/kokkos/kokkos_5538.diff +++ /dev/null @@ -1,199 +0,0 @@ -diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos -index 22af411f32..530510a0d1 100644 ---- a/lib/kokkos/Makefile.kokkos -+++ b/lib/kokkos/Makefile.kokkos -@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP" - #KOKKOS_DEVICES ?= "Threads" - # Options: - # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR --# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 -+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90 - # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX - # IBM: BGQ,Power7,Power8,Power9 - # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A -@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt - KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) - 
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) - KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) -+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90) - KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ -@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE - + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ - + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ - + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ -- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) -+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \ -+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90)) - - #SEK: This seems like a bug to me - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) -@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 - endif -+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) -+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") -+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") -+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 -+ endif - - ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) - KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) -diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in -index 88ddc48378..b83ced9243 100644 ---- a/lib/kokkos/cmake/KokkosCore_config.h.in -+++ b/lib/kokkos/cmake/KokkosCore_config.h.in -@@ -102,6 +102,7 @@ - #cmakedefine KOKKOS_ARCH_AMPERE - #cmakedefine KOKKOS_ARCH_AMPERE80 - #cmakedefine KOKKOS_ARCH_AMPERE86 -+#cmakedefine KOKKOS_ARCH_HOPPER90 - #cmakedefine KOKKOS_ARCH_AMD_ZEN - #cmakedefine KOKKOS_ARCH_AMD_ZEN2 - #cmakedefine KOKKOS_ARCH_AMD_ZEN3 -diff 
--git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc -index f56cef1651..2585a6a64c 100644 ---- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc -+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc -@@ -74,6 +74,7 @@ int main() { - case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; - case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; - case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; -+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break; - default: - std::cout << "Compute capability " << compute_capability - << " is not supported" << std::endl; -diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake -index ef16aad047..c1d76cceeb 100644 ---- a/lib/kokkos/cmake/kokkos_arch.cmake -+++ b/lib/kokkos/cmake/kokkos_arch.cmake -@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK - KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") - KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") - KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") -+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") - - IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS) - SET(KOKKOS_SHOW_HIP_ARCHS ON) -@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72) - CHECK_CUDA_ARCH(TURING75 sm_75) - CHECK_CUDA_ARCH(AMPERE80 sm_80) - CHECK_CUDA_ARCH(AMPERE86 sm_86) -+CHECK_CUDA_ARCH(HOPPER90 sm_90) - - SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") - FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) -@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) - SET(KOKKOS_ARCH_AMPERE ON) - ENDIF() - -+IF (KOKKOS_ARCH_HOPPER90) -+ SET(KOKKOS_ARCH_HOPPER ON) -+ENDIF() -+ 
- #Regardless of version, make sure we define the general architecture name - IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A) - SET(KOKKOS_ARCH_VEGA ON) -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index 56f9117844..fcd4773dbc 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { - case 61: return 96; - case 70: - case 80: -- case 86: return 8; -+ case 86: -+ case 90: return 8; - case 75: return 32; - default: - Kokkos::Impl::throw_runtime_exception( -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp -index 40a263561f..8c40ebd60d 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp -@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION - #endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010 - - #if CUDA_VERSION >= 11010 && \ -- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86))) -+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER))) - KOKKOS_INLINE_FUNCTION - bhalf_t cast_to_bhalf(bhalf_t val) { return val; } - KOKKOS_INLINE_FUNCTION -diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp -index f9451ecfe6..2ce1efb98c 100644 ---- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp -+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp -@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl { - - struct OpenACC_Traits { - #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_AMPERE) -+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER) - static 
constexpr acc_device_t dev_type = acc_device_nvidia; - static constexpr bool may_fallback_to_host = false; - #else -diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp -index a9bc085912..27ee1d4232 100644 ---- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp -+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp -@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() { - - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures - // from Pascal and upwards. --#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) -+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ -+ defined(KOKKOS_ARCH_HOPPER) - #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) - omp_set_num_teams(512); - #endif -diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp -index 840db4327c..7e5addbc5b 100644 ---- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp -+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp -@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { - #if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \ - !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \ - !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \ -- !defined(KOKKOS_ARCH_AMPERE) -+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER) - if (!settings.has_device_id() && gpu_devices.empty()) { - Impl::SYCLInternal::singleton().initialize(sycl::device()); - return; -diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp -index 5ac7d8af30..ba101f699e 100644 ---- 
a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp -+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp -@@ -335,9 +335,10 @@ class TeamPolicyInternal - return std::min({ - int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), - // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. --#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) -+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ -+ defined(KOKKOS_ARCH_HOPPER) - 256, - #endif - max_threads_for_memory -@@ -367,9 +368,10 @@ class TeamPolicyInternal - return std::min({ - int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), - // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. --#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) -+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ -+ defined(KOKKOS_ARCH_HOPPER) - 256, - #endif - max_threads_for_memory diff --git a/lib/kokkos/kokkos_5706.diff b/lib/kokkos/kokkos_5706.diff deleted file mode 100644 index 2bfbb35b06..0000000000 --- a/lib/kokkos/kokkos_5706.diff +++ /dev/null @@ -1,523 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index fcd4773dbc..30b6958a67 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -207,7 +207,6 
@@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, - LaunchBounds{}); - } - --// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1) - // NOTE these number can be obtained several ways: - // * One option is to download the CUDA Occupancy Calculator spreadsheet, select - // "Compute Capability" first and check what is the smallest "Shared Memory -@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { - return 0; - }() * 1024; - } -+ - } // namespace Impl - } // namespace Kokkos - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp -index 5811498e01..e22eb3b842 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp -@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default - } - #endif - --#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API -- cudaThreadSetCacheConfig(cudaFuncCachePreferShared); --#else -- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); --#endif -- - // Init the array for used for arbitrarily sized atomics - if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays(); - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -index b7a80ad84f..5c4c3a7d39 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -@@ -93,10 +93,6 @@ namespace Impl { - // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) - // function qualifier which could be used to improve performance. 
- //---------------------------------------------------------------------------- --// Maximize L1 cache and minimize shared memory: --// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 ); --// For 2.0 capability: 48 KB L1 and 16 KB shared --//---------------------------------------------------------------------------- - - template - __global__ static void cuda_parallel_launch_constant_memory() { -@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { - } - } - --// This function needs to be template on DriverType and LaunchBounds -+// These functions needs to be template on DriverType and LaunchBounds - // so that the static bool is unique for each type combo - // KernelFuncPtr does not necessarily contain that type information. -+ - template --inline void configure_shmem_preference(KernelFuncPtr const& func, -- bool prefer_shmem) { -+const cudaFuncAttributes& get_cuda_kernel_func_attributes( -+ const KernelFuncPtr& func) { -+ // Only call cudaFuncGetAttributes once for each unique kernel -+ // by leveraging static variable initialization rules -+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes { -+ cudaFuncAttributes attr; -+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); -+ return attr; -+ }; -+ static cudaFuncAttributes func_attr = wrap_get_attributes(); -+ return func_attr; -+} -+ -+template -+inline void configure_shmem_preference(const KernelFuncPtr& func, -+ const cudaDeviceProp& device_props, -+ const size_t block_size, int& shmem, -+ const size_t occupancy) { - #ifndef KOKKOS_ARCH_KEPLER -- // On Kepler the L1 has no benefit since it doesn't cache reads -+ -+ const auto& func_attr = -+ get_cuda_kernel_func_attributes(func); -+ -+ // Compute limits for number of blocks due to registers/SM -+ const size_t regs_per_sm = device_props.regsPerMultiprocessor; -+ const size_t regs_per_thread = func_attr.numRegs; -+ // The granularity of register allocation is chunks of 256 registers per 
warp -+ // -> 8 registers per thread -+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); -+ const size_t max_blocks_regs = -+ regs_per_sm / (allocated_regs_per_thread * block_size); -+ -+ // Compute how many threads per sm we actually want -+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; -+ // only allocate multiples of warp size -+ const size_t num_threads_desired = -+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32; -+ // Get close to the desired occupancy, -+ // don't undershoot by much but also don't allocate a whole new block just -+ // because one is a few threads over otherwise. -+ size_t num_blocks_desired = -+ (num_threads_desired + block_size * 0.8) / block_size; -+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired); -+ if (num_blocks_desired == 0) num_blocks_desired = 1; -+ -+ // Calculate how much shared memory we need per block -+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes; -+ -+ // The minimum shared memory allocation we can have in total per SM is 8kB. -+ // If we want to lower occupancy we have to make sure we request at least that -+ // much in aggregate over all blocks, so that shared memory actually becomes a -+ // limiting factor for occupancy -+ constexpr size_t min_shmem_size_per_sm = 8192; -+ if ((occupancy < 100) && -+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) { -+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired; -+ // Need to set the caller's shmem variable so that the -+ // kernel launch uses the correct dynamic shared memory request -+ shmem = shmem_per_block - func_attr.sharedSizeBytes; -+ } -+ -+ // Compute the carveout fraction we need based on occupancy -+ // Use multiples of 8kB -+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; -+ size_t carveout = shmem_per_block == 0 -+ ? 
0 -+ : 100 * -+ (((num_blocks_desired * shmem_per_block + -+ min_shmem_size_per_sm - 1) / -+ min_shmem_size_per_sm) * -+ min_shmem_size_per_sm) / -+ max_shmem_per_sm; -+ if (carveout > 100) carveout = 100; -+ -+ // Set the carveout, but only call it once per kernel or when it changes - auto set_cache_config = [&] { -- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig( -- func, -- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1))); -- return prefer_shmem; -+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute( -+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)); -+ return carveout; - }; -- static bool cache_config_preference_cached = set_cache_config(); -- if (cache_config_preference_cached != prefer_shmem) { -+ // Store the value in a static variable so we only reset if needed -+ static size_t cache_config_preference_cached = set_cache_config(); -+ if (cache_config_preference_cached != carveout) { - cache_config_preference_cached = set_cache_config(); - } - #else - // Use the parameters so we don't get a warning - (void)func; -- (void)prefer_shmem; -+ (void)device_props; -+ (void)block_size; -+ (void)occupancy; - #endif - } - --template --std::enable_if_t --modify_launch_configuration_if_desired_occupancy_is_specified( -- Policy const& policy, cudaDeviceProp const& properties, -- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem, -- bool& prefer_shmem) { -- int const block_size = block.x * block.y * block.z; -- int const desired_occupancy = policy.impl_get_desired_occupancy().value(); -- -- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties); -- size_t const static_shmem = attributes.sharedSizeBytes; -- -- // round to nearest integer and avoid division by zero -- int active_blocks = std::max( -- 1, static_cast(std::round( -- static_cast(properties.maxThreadsPerMultiProcessor) / -- block_size * desired_occupancy / 100))); -- int const dynamic_shmem = -- shmem_per_sm_prefer_l1 / active_blocks - 
static_shmem; -- -- if (dynamic_shmem > shmem) { -- shmem = dynamic_shmem; -- prefer_shmem = false; -- } --} -- --template --std::enable_if_t --modify_launch_configuration_if_desired_occupancy_is_specified( -- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&, -- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {} -- - // end Some helper functions for launch code readability }}}1 - //============================================================================== - -@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker< - #ifdef KOKKOS_CUDA_ENABLE_GRAPHS - inline static void create_parallel_launch_graph_node( - DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, -- CudaInternal const* cuda_instance, bool prefer_shmem) { -+ CudaInternal const* cuda_instance) { - //---------------------------------------- - auto const& graph = Impl::get_cuda_graph_from_kernel(driver); - KOKKOS_EXPECTS(bool(graph)); -@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker< - - if (!Impl::is_empty_launch(grid, block)) { - Impl::check_shmem_request(cuda_instance, shmem); -- Impl::configure_shmem_preference( -- base_t::get_kernel_func(), prefer_shmem); -+ if (DriverType::Policy:: -+ experimental_contains_desired_occupancy) { -+ int desired_occupancy = -+ driver.get_policy().impl_get_desired_occupancy().value(); -+ size_t block_size = block.x * block.y * block.z; -+ Impl::configure_shmem_preference( -+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -+ shmem, desired_occupancy); -+ } - - void const* args[] = {&driver}; - -@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker< - #ifdef KOKKOS_CUDA_ENABLE_GRAPHS - inline static void create_parallel_launch_graph_node( - DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, -- CudaInternal const* cuda_instance, bool prefer_shmem) { -+ CudaInternal const* cuda_instance) { - //---------------------------------------- - auto const& graph = 
Impl::get_cuda_graph_from_kernel(driver); - KOKKOS_EXPECTS(bool(graph)); -@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker< - - if (!Impl::is_empty_launch(grid, block)) { - Impl::check_shmem_request(cuda_instance, shmem); -- Impl::configure_shmem_preference( -- base_t::get_kernel_func(), prefer_shmem); -+ if constexpr (DriverType::Policy:: -+ experimental_contains_desired_occupancy) { -+ int desired_occupancy = -+ driver.get_policy().impl_get_desired_occupancy().value(); -+ size_t block_size = block.x * block.y * block.z; -+ Impl::configure_shmem_preference( -+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -+ shmem, desired_occupancy); -+ } - - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); - -@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker< - #ifdef KOKKOS_CUDA_ENABLE_GRAPHS - inline static void create_parallel_launch_graph_node( - DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, -- CudaInternal const* cuda_instance, bool prefer_shmem) { -+ CudaInternal const* cuda_instance) { - // Just use global memory; coordinating through events to share constant - // memory with the non-graph interface is not really reasonable since - // events don't work with Graphs directly, and this would anyway require -@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker< - DriverType, LaunchBounds, - Experimental::CudaLaunchMechanism::GlobalMemory>; - global_launch_impl_t::create_parallel_launch_graph_node( -- driver, grid, block, shmem, cuda_instance, prefer_shmem); -+ driver, grid, block, shmem, cuda_instance); - } - #endif - }; -@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl< - - inline static void launch_kernel(const DriverType& driver, const dim3& grid, - const dim3& block, int shmem, -- const CudaInternal* cuda_instance, -- bool prefer_shmem) { -+ const CudaInternal* cuda_instance) { - if (!Impl::is_empty_launch(grid, block)) { - // Prevent multiple threads to simultaneously set the 
cache configuration - // preference and launch the same kernel -@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl< - - Impl::check_shmem_request(cuda_instance, shmem); - -- // If a desired occupancy is specified, we compute how much shared memory -- // to ask for to achieve that occupancy, assuming that the cache -- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic -- // shared memory computed is actually smaller than `shmem` we overwrite -- // `shmem` and set `prefer_shmem` to `false`. -- modify_launch_configuration_if_desired_occupancy_is_specified( -- driver.get_policy(), cuda_instance->m_deviceProp, -- get_cuda_func_attributes(), block, shmem, prefer_shmem); -- -- Impl::configure_shmem_preference< -- DriverType, Kokkos::LaunchBounds>( -- base_t::get_kernel_func(), prefer_shmem); -+ if (DriverType::Policy:: -+ experimental_contains_desired_occupancy) { -+ int desired_occupancy = -+ driver.get_policy().impl_get_desired_occupancy().value(); -+ size_t block_size = block.x * block.y * block.z; -+ Impl::configure_shmem_preference< -+ DriverType, -+ Kokkos::LaunchBounds>( -+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -+ shmem, desired_occupancy); -+ } - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - -@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl< - } - - static cudaFuncAttributes get_cuda_func_attributes() { -- // Race condition inside of cudaFuncGetAttributes if the same address is -- // given requires using a local variable as input instead of a static Rely -- // on static variable initialization to make sure only one thread executes -- // the code and the result is visible. 
-- auto wrap_get_attributes = []() -> cudaFuncAttributes { -- cudaFuncAttributes attr_tmp; -- KOKKOS_IMPL_CUDA_SAFE_CALL( -- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func())); -- return attr_tmp; -- }; -- static cudaFuncAttributes attr = wrap_get_attributes(); -- return attr; -+ return get_cuda_kernel_func_attributes< -+ DriverType, Kokkos::LaunchBounds>( -+ base_t::get_kernel_func()); - } - }; - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -index e586bb4cc6..0e348c092a 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -@@ -121,8 +121,7 @@ class ParallelFor, Kokkos::Cuda> { - maxblocks[1]), - 1); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); -@@ -139,8 +138,7 @@ class ParallelFor, Kokkos::Cuda> { - (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z -@@ -158,8 +156,7 @@ class ParallelFor, Kokkos::Cuda> { - (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to - // threadIdx.z -@@ -175,8 +172,7 @@ class ParallelFor, 
Kokkos::Cuda> { - (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to - // threadIdx.z -@@ -191,8 +187,7 @@ class ParallelFor, Kokkos::Cuda> { - std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); - } -@@ -405,8 +400,8 @@ class ParallelReduce, ReducerType, - - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - if (m_result_ptr) { -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp -index ac160f8fe2..d1031751c2 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp -@@ -135,8 +135,7 @@ class ParallelFor, Kokkos::Cuda> { - #endif - - CudaParallelLaunch( -- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance()); - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) -@@ -375,8 +374,8 @@ class ParallelReduce, ReducerType, - - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ 
.impl_internal_space_instance()); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - if (m_result_ptr) { -@@ -726,16 +725,16 @@ class ParallelScan, Kokkos::Cuda> { - m_final = false; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION - } - #endif - m_final = true; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - } - } - -@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal, - m_final = false; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION - } - #endif - m_final = true; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - const int size = Analysis::value_size(m_functor); - #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp -index cdd16085b3..ea9430b812 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp -@@ -552,8 +552,8 @@ class ParallelFor, - - CudaParallelLaunch( - *this, grid, block, shmem_size_total, -- m_policy.space().impl_internal_space_instance(), -- true); // copy to device and execute -+ m_policy.space() -+ 
.impl_internal_space_instance()); // copy to device and execute - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) -@@ -878,8 +878,8 @@ class ParallelReduce, - - CudaParallelLaunch( - *this, grid, block, shmem_size_total, -- m_policy.space().impl_internal_space_instance(), -- true); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - m_policy.space().fence( -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp -index 34d4bef9fd..178012431c 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp -@@ -428,11 +428,6 @@ struct CudaReductionsFunctor { - // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) - // function qualifier which could be used to improve performance. - //---------------------------------------------------------------------------- --// Maximize shared memory and minimize L1 cache: --// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared ); --// For 2.0 capability: 48 KB shared and 16 KB L1 --//---------------------------------------------------------------------------- --//---------------------------------------------------------------------------- - /* - * Algorithmic constraints: - * (a) blockDim.y <= 1024 -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp -index fb3a6b138f..a12378a891 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp -@@ -100,8 +100,7 @@ class ParallelFor, - const int shared = 0; - - Kokkos::Impl::CudaParallelLaunch( -- *this, grid, block, shared, Cuda().impl_internal_space_instance(), -- false); -+ *this, grid, block, shared, Cuda().impl_internal_space_instance()); - } - - inline 
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/lib/kokkos/kokkos_5731.diff b/lib/kokkos/kokkos_5731.diff deleted file mode 100644 index e95f4a1546..0000000000 --- a/lib/kokkos/kokkos_5731.diff +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index 30b6958a67..b94f053272 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, - LaunchBounds{}); - } - --// NOTE these number can be obtained several ways: --// * One option is to download the CUDA Occupancy Calculator spreadsheet, select --// "Compute Capability" first and check what is the smallest "Shared Memory --// Size Config" that is available. The "Shared Memory Per Multiprocessor" in --// bytes is then to be found below in the summary. 
--// * Another option would be to look for the information in the "Tuning --// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in --// the "Shared Memory" section (more tedious) --inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { -- int const compute_capability = properties.major * 10 + properties.minor; -- return [compute_capability]() { -- switch (compute_capability) { -- case 30: -- case 32: -- case 35: return 16; -- case 37: return 80; -- case 50: -- case 53: -- case 60: -- case 62: return 64; -- case 52: -- case 61: return 96; -- case 70: -- case 80: -- case 86: -- case 90: return 8; -- case 75: return 32; -- default: -- Kokkos::Impl::throw_runtime_exception( -- "Unknown device in cuda block size deduction"); -- } -- return 0; -- }() * 1024; --} -- - } // namespace Impl - } // namespace Kokkos - diff --git a/lib/kokkos/kokkos_5739.diff b/lib/kokkos/kokkos_5739.diff deleted file mode 100644 index fe7a1ff551..0000000000 --- a/lib/kokkos/kokkos_5739.diff +++ /dev/null @@ -1,204 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index b94f053272..252c13c524 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -53,17 +53,69 @@ - namespace Kokkos { - namespace Impl { - -+inline int cuda_warp_per_sm_allocation_granularity( -+ cudaDeviceProp const& properties) { -+ // Allocation granularity of warps in each sm -+ switch (properties.major) { -+ case 3: -+ case 5: -+ case 7: -+ case 8: -+ case 9: return 4; -+ case 6: return (properties.minor == 0 ? 
2 : 4); -+ default: -+ throw_runtime_exception( -+ "Unknown device in cuda warp per sm allocation granularity"); -+ return 0; -+ } -+} -+ -+inline int cuda_max_warps_per_sm_registers( -+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) { -+ // Maximum number of warps per sm as a function of register counts, -+ // subject to the constraint that warps are allocated with a fixed granularity -+ int const max_regs_per_block = properties.regsPerBlock; -+ int const regs_per_warp = attributes.numRegs * properties.warpSize; -+ int const warp_granularity = -+ cuda_warp_per_sm_allocation_granularity(properties); -+ // The granularity of register allocation is chunks of 256 registers per warp, -+ // which implies a need to over-allocate, so we round up -+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256; -+ -+ // The maximum number of warps per SM is constrained from above by register -+ // allocation. To satisfy the constraint that warps per SM is allocated at a -+ // finite granularity, we need to round down. 
-+ int const max_warps_per_sm = -+ warp_granularity * -+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity)); -+ -+ return max_warps_per_sm; -+} -+ - inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, - cudaFuncAttributes const& attributes, - int block_size, size_t dynamic_shmem) { -- // Limits due do registers/SM -+ // Limits due to registers/SM - int const regs_per_sm = properties.regsPerMultiprocessor; - int const regs_per_thread = attributes.numRegs; - // The granularity of register allocation is chunks of 256 registers per warp - // -> 8 registers per thread - int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); -- int const max_blocks_regs = -- regs_per_sm / (allocated_regs_per_thread * block_size); -+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size); -+ -+ // Compute the maximum number of warps as a function of the number of -+ // registers -+ int const max_warps_per_sm_registers = -+ cuda_max_warps_per_sm_registers(properties, attributes); -+ -+ // Constrain the number of blocks to respect the maximum number of warps per -+ // SM On face value this should be an equality, but due to the warp -+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the -+ // left-hand-side of this comparison can overshoot what the hardware allows -+ // based on register counts alone -+ while ((max_blocks_regs * block_size / properties.warpSize) > -+ max_warps_per_sm_registers) -+ max_blocks_regs--; - - // Limits due to shared memory/SM - size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor; -@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, - LaunchBounds{}); - } - -+template -+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr, -+ LaunchBounds) { -+ auto const& prop = Kokkos::Cuda().cuda_device_prop(); -+ -+ // Thin version of cuda_get_opt_block_size for cases where there is no shared -+ // memory -+ auto 
const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; }; -+ -+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem, -+ LaunchBounds{}); -+} -+ - } // namespace Impl - } // namespace Kokkos - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -index 5c4c3a7d39..170183ca0a 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, - // The granularity of register allocation is chunks of 256 registers per warp - // -> 8 registers per thread - const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); -- const size_t max_blocks_regs = -+ size_t max_blocks_regs = - regs_per_sm / (allocated_regs_per_thread * block_size); - -+ // Compute the maximum number of warps as a function of the number of -+ // registers -+ const size_t max_warps_per_sm_registers = -+ cuda_max_warps_per_sm_registers(device_props, func_attr); -+ -+ // Constrain the number of blocks to respect the maximum number of warps per -+ // SM On face value this should be an equality, but due to the warp -+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the -+ // left-hand-side of this comparison can overshoot what the hardware allows -+ // based on register counts alone -+ while ((max_blocks_regs * block_size / device_props.warpSize) > -+ max_warps_per_sm_registers) -+ max_blocks_regs--; -+ - // Compute how many threads per sm we actually want - const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; - // only allocate multiples of warp size -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -index 0e348c092a..7e4f62f12e 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -+++ 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -@@ -67,6 +67,34 @@ - namespace Kokkos { - namespace Impl { - -+template -+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { -+ cudaFuncAttributes attr = -+ CudaParallelLaunch::get_cuda_func_attributes(); -+ auto const& prop = pol.space().cuda_device_prop(); -+ -+ // Limits due to registers/SM, MDRange doesn't have -+ // shared memory constraints -+ int const optimal_block_size = -+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{}); -+ -+ // Compute how many blocks of this size we can launch, based on warp -+ // constraints -+ int const max_warps_per_sm_registers = -+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr); -+ int const max_num_threads_from_warps = -+ max_warps_per_sm_registers * prop.warpSize; -+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size; -+ -+ // Compute the total number of threads -+ int const max_threads_per_sm = optimal_block_size * max_num_blocks; -+ -+ return std::min( -+ max_threads_per_sm, -+ static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); -+} -+ - template - class ParallelFor, Kokkos::Cuda> { - public: -@@ -85,18 +113,7 @@ class ParallelFor, Kokkos::Cuda> { - public: - template - static int max_tile_size_product(const Policy& pol, const Functor&) { -- cudaFuncAttributes attr = -- CudaParallelLaunch::get_cuda_func_attributes(); -- auto const& prop = pol.space().cuda_device_prop(); -- // Limits due to registers/SM, MDRange doesn't have -- // shared memory constraints -- int const regs_per_sm = prop.regsPerMultiprocessor; -- int const regs_per_thread = attr.numRegs; -- int const max_threads_per_sm = regs_per_sm / regs_per_thread; -- return std::min( -- max_threads_per_sm, -- static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); -+ return max_tile_size_product_helper(pol, LaunchBounds{}); - } - Policy const& get_policy() const { return m_rp; } - inline __device__ void 
operator()() const { -@@ -258,17 +275,7 @@ class ParallelReduce, ReducerType, - public: - template - static int max_tile_size_product(const Policy& pol, const Functor&) { -- cudaFuncAttributes attr = -- CudaParallelLaunch::get_cuda_func_attributes(); -- auto const& prop = pol.space().cuda_device_prop(); -- // Limits due do registers/SM -- int const regs_per_sm = prop.regsPerMultiprocessor; -- int const regs_per_thread = attr.numRegs; -- int const max_threads_per_sm = regs_per_sm / regs_per_thread; -- return std::min( -- max_threads_per_sm, -- static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); -+ return max_tile_size_product_helper(pol, LaunchBounds{}); - } - Policy const& get_policy() const { return m_policy; } - inline __device__ void exec_range(reference_type update) const { diff --git a/lib/kokkos/kokkos_fix_5706_apply_last.diff b/lib/kokkos/kokkos_fix_5706_apply_last.diff deleted file mode 100644 index 5d298323fd..0000000000 --- a/lib/kokkos/kokkos_fix_5706_apply_last.diff +++ /dev/null @@ -1,63 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -index 170183ca0a..ba43e362bb 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker< - Impl::check_shmem_request(cuda_instance, shmem); - if (DriverType::Policy:: - experimental_contains_desired_occupancy) { -+ /* - int desired_occupancy = - driver.get_policy().impl_get_desired_occupancy().value(); - size_t block_size = block.x * block.y * block.z; - Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -- shmem, desired_occupancy); -+ shmem, desired_occupancy);*/ -+ Kokkos::Impl::throw_runtime_exception( -+ std::string("Cuda graph node creation FAILED:" -+ " occupancy requests are currently broken.")); - } - - void const* args[] = 
{&driver}; -@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker< - - if (!Impl::is_empty_launch(grid, block)) { - Impl::check_shmem_request(cuda_instance, shmem); -- if constexpr (DriverType::Policy:: -+ if (DriverType::Policy:: - experimental_contains_desired_occupancy) { -- int desired_occupancy = -+ /*int desired_occupancy = - driver.get_policy().impl_get_desired_occupancy().value(); - size_t block_size = block.x * block.y * block.z; - Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -- shmem, desired_occupancy); -+ shmem, desired_occupancy);*/ -+ Kokkos::Impl::throw_runtime_exception( -+ std::string("Cuda graph node creation FAILED:" -+ " occupancy requests are currently broken.")); - } - - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); -@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl< - - if (DriverType::Policy:: - experimental_contains_desired_occupancy) { -- int desired_occupancy = -+ /*int desired_occupancy = - driver.get_policy().impl_get_desired_occupancy().value(); - size_t block_size = block.x * block.y * block.z; - Impl::configure_shmem_preference< - DriverType, - Kokkos::LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -- shmem, desired_occupancy); -+ shmem, desired_occupancy);*/ -+ Kokkos::Impl::throw_runtime_exception( -+ std::string("Cuda graph node creation FAILED:" -+ " occupancy requests are currently broken.")); - } - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); diff --git a/potentials/HGa.msmeam b/potentials/HGa.msmeam new file mode 100644 index 0000000000..9f01501c16 --- /dev/null +++ b/potentials/HGa.msmeam @@ -0,0 +1,30 @@ +bkgd_dyn = 1 +emb_lin_neg = 1 +augt1=0 +ialloy=1 +rc = 5.9 +#H +attrac(1,1)=0.460 +repuls(1,1)=0.460 +Cmin(1,1,1)=1.3 # PuMS +Cmax(1,1,1)= 2.80 +nn2(1,1)=1 +#Ga +rho0(2) = 0.6 +attrac(2,2)=0.097 +repuls(2,2)=0.097 +nn2(2,2)=1 +#HGa +attrac(1,2)=0.300 +repuls(1,2)=0.300 +lattce(1,2)=l12 
+re(1,2)=3.19 +delta(1,2)=-0.48 +alpha(1,2)=6.6 +Cmin(1,1,2)=2.0 +Cmin(2,1,2)= 2.0 +Cmin(1,2,1)=2.0 +Cmin(2,2,1) = 1.4 +Cmin(1,2,2) = 1.4 +Cmin(1,1,2) = 1.4 +nn2(1,2)=1 diff --git a/potentials/library.msmeam b/potentials/library.msmeam new file mode 100644 index 0000000000..9937eaee08 --- /dev/null +++ b/potentials/library.msmeam @@ -0,0 +1,14 @@ +# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010 +# ms-meam data format May 2010 +# elt lat z ielement atwt +# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub +# - t0 t1 t2 t3 t1m t2m t3m rozero ibar +# NOTE: leading character cannot be a space + +'H' 'dim' 1.0 1 1.0079 +2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50 +1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0 + +'Ga4' 'fcc' 12.0 31 69.723 +4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97 +1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0 diff --git a/python/lammps/mliap/__init__.py b/python/lammps/mliap/__init__.py index c1a9752855..6e638ac360 100644 --- a/python/lammps/mliap/__init__.py +++ b/python/lammps/mliap/__init__.py @@ -32,7 +32,7 @@ if not pylib.Py_IsInitialized(): else: from .loader import load_model, load_unified, activate_mliappy try: - from .loader import load_model_kokkos, activate_mliappy_kokkos + from .loader import load_model_kokkos, load_unified_kokkos, activate_mliappy_kokkos except Exception as ee: # ignore import error, it means that the KOKKOS package was not included in LAMMPS pass diff --git a/python/lammps/mliap/loader.py b/python/lammps/mliap/loader.py index 940bd10f1f..558c69a7a9 100644 --- a/python/lammps/mliap/loader.py +++ b/python/lammps/mliap/loader.py @@ -75,7 +75,7 @@ def activate_mliappy(lmp): def activate_mliappy_kokkos(lmp): try: library = lmp.lib - module_names = ["mliap_model_python_couple_kokkos"] + module_names = ["mliap_model_python_couple_kokkos", "mliap_unified_couple_kokkos"] api_version = library.lammps_python_api_version() for module_name in 
module_names: @@ -118,3 +118,12 @@ def load_unified(model): ) from ie mliap_unified_couple.load_from_python(model) +def load_unified_kokkos(model): + try: + import mliap_unified_couple_kokkos + except ImportError as ie: + raise ImportError("ML-IAP python module must be activated before loading\n" + "the pair style. Call lammps.mliap.activate_mliappy(lmp)." + ) from ie + mliap_unified_couple_kokkos.load_from_python(model) + diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index f222613c3c..ae3dbf16c4 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -22,6 +22,7 @@ #include "memory.h" #include "neighbor.h" #include "remap_wrap.h" +#include "timer.h" #include #include @@ -326,15 +327,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() cfft[n++] = ZEROF; } + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -382,15 +391,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() debug_scalar(FFT,"PRE Convo / POST Remap"); debug_file(FFT,"pre.convo.post.remap"); #endif + + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -423,7 +441,16 @@ void 
*AmoebaConvolution::post_convolution_3d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = platform::walltime(); + + time_fft += time1 - time0; #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); @@ -465,8 +492,18 @@ void *AmoebaConvolution::post_convolution_4d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = platform::walltime(); + + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); debug_file(CFFT2,"post.convo.post.fft"); diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 99ad11ade4..bed65149ec 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers { int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out; int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft; bigint nfft_global; // nx * ny * nz - double *grid_brick_start; // lower left corner of (c)grid_brick data + FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int); ~AmoebaConvolution(); @@ -47,35 +47,37 @@ class AmoebaConvolution : protected Pointers { FFT_SCALAR *pre_convolution(); void *post_convolution(); - private: - int which; // caller name for convolution being performed - int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick - int nbrick_owned; // owned grid points in brick decomp - int nbrick_ghosts; // owned + ghost brick grid points - int ngrid_either; // max of nbrick_owned or nfft_owned + double time_fft; + + 
protected: + int which; // caller name for convolution being performed + int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick + int nbrick_owned; // owned grid points in brick decomp + int nbrick_ghosts; // owned + ghost brick grid points + int ngrid_either; // max of nbrick_owned or nfft_owned class Pair *amoeba; class FFT3d *fft1, *fft2; class Grid3d *gc; class Remap *remap; - double ***grid_brick; // 3d real brick grid with ghosts - double ****cgrid_brick; // 4d complex brick grid with ghosts + FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts + FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector - double *gc_buf1, *gc_buf2; // buffers for GridComm - double *remap_buf; // buffer for Remap + FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm + FFT_SCALAR *remap_buf; // buffer for Remap void allocate_grid(); void deallocate_grid(); void *zero_3d(); void *zero_4d(); FFT_SCALAR *pre_convolution_3d(); - FFT_SCALAR *pre_convolution_4d(); + virtual FFT_SCALAR *pre_convolution_4d(); void *post_convolution_3d(); - void *post_convolution_4d(); + virtual void *post_convolution_4d(); void procs2grid2d(int, int, int, int &, int &); // DEBUG diff --git a/src/AMOEBA/amoeba_dispersion.cpp b/src/AMOEBA/amoeba_dispersion.cpp index f3af921d85..cc283f22d2 100644 --- a/src/AMOEBA/amoeba_dispersion.cpp +++ b/src/AMOEBA/amoeba_dispersion.cpp @@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) d_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero(); // map atoms to grid @@ -294,7 +294,7 @@ void PairAmoeba::dispersion_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = d_kspace->pre_convolution(); + FFT_SCALAR 
*gridfft = d_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index a6724e2bb7..ecc20a198c 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -24,6 +24,7 @@ #include "math_special.h" #include "my_page.h" #include "neigh_list.h" +#include "timer.h" #include @@ -381,8 +382,6 @@ void PairAmoeba::induce() } } - // if (comm->me == 0) printf("CG iteration count = %d\n",iter); - // terminate the calculation if dipoles failed to converge // NOTE: could make this an error @@ -546,13 +545,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } } - // get the reciprocal space part of the mutual field - - if (polar_kspace_flag) umutual1(field,fieldp); + double time0, time1, time2; + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); // get the real space portion of the mutual field if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = platform::walltime(); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + time2 = platform::walltime(); // add the self-energy portion of the mutual field @@ -563,6 +568,11 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) fieldp[i][j] += term*uinp[i][j]; } } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -785,7 +795,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the reciprocal space part of the permanent field + double time0, time1, time2; + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + if (polar_kspace_flag) udirect1(field); + time1 = platform::walltime(); for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { @@ -796,6 +811,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the real 
space portion of the permanent field if (polar_rspace_flag) udirect2b(field,fieldp); + time2 = platform::walltime(); // get the self-energy portion of the permanent field @@ -806,6 +822,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) fieldp[i][j] += term*rpole[i][j+1]; } } + + // accumulate timing information + + time_direct_kspace += time1 - time0; + time_direct_rspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -842,18 +863,26 @@ void PairAmoeba::umutual1(double **field, double **fieldp) } } + double time0, time1; + // gridpre = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre = (double ****) ic_kspace->zero(); + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + grid_uind(fuind,fuinp,gridpre); + time1 = platform::walltime(); + time_grid_uind += (time1 - time0); + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = ic_kspace->pre_convolution(); + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation @@ -883,12 +912,18 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) ic_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get potential + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + time1 = platform::walltime(); + time_fphi_uind += (time1 - time0); + // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { @@ -1055,7 +1090,7 @@ void PairAmoeba::udirect1(double **field) // gridpre = my portion of 3d 
grid in brick decomp w/ ghost values // zeroed by setup() - double ***gridpre = (double ***) i_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero(); // map multipole moments to grid @@ -1064,7 +1099,7 @@ void PairAmoeba::udirect1(double **field) // pre-convolution operations including forward FFT // gridfft = my 1d portion of complex 3d grid in FFT decomp - double *gridfft = i_kspace->pre_convolution(); + FFT_SCALAR *gridfft = i_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1109,7 +1144,7 @@ void PairAmoeba::udirect1(double **field) // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) i_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_kspace.cpp b/src/AMOEBA/amoeba_kspace.cpp index da6483ef40..6d2fb64dd6 100644 --- a/src/AMOEBA/amoeba_kspace.cpp +++ b/src/AMOEBA/amoeba_kspace.cpp @@ -68,25 +68,23 @@ void PairAmoeba::moduli() int maxfft = MAX(nfft1,nfft2); maxfft = MAX(maxfft,nfft3); - double *array = new double[bsorder]; - double *bsarray = new double[maxfft]; + if (maxfft > _nfft_max) { + memory->destroy(_moduli_bsarray); + _nfft_max = maxfft; + memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray"); + } // compute and load the moduli values double x = 0.0; - bspline(x,bsorder,array); + bspline(x,bsorder,_moduli_array); - for (i = 0; i < maxfft; i++) bsarray[i] = 0.0; - for (i = 0; i < bsorder; i++) bsarray[i+1] = array[i]; + for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0; + for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i]; - dftmod(bsmod1,bsarray,nfft1,bsorder); - dftmod(bsmod2,bsarray,nfft2,bsorder); - dftmod(bsmod3,bsarray,nfft3,bsorder); - - // perform deallocation of local arrays - - delete[] array; - delete[] bsarray; + 
dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder); + dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder); + dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder); } /* ---------------------------------------------------------------------- @@ -525,7 +523,7 @@ void PairAmoeba::frac_to_cart() grid_mpole maps fractional atomic multipoles to PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_mpole(double **fmp, double ***grid) +void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -598,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid) the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_mpole(double ***grid, double **fphi) +void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi) { int i,j,k,m,ib,jb,kb; double v0,v1,v2,v3; @@ -742,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi) grid_uind maps fractional induced dipoles to the PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid) +void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -793,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid) fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, +void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, double **fdip_phi2, double **fdip_sum_phi) { int i,j,k,m,ib,jb,kb; @@ -1042,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, grid_disp maps dispersion coefficients to PME grid ------------------------------------------------------------------------- 
*/ -void PairAmoeba::grid_disp(double ***grid) +void PairAmoeba::grid_disp(FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb,itype,iclass; double v0,u0,t0; diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index f58395aa1c..a1503a91f3 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -21,6 +21,7 @@ #include "math_const.h" #include "math_special.h" #include "neigh_list.h" +#include "timer.h" #include @@ -55,6 +56,8 @@ void PairAmoeba::multipole() double qixx,qixy,qixz,qiyy,qiyz,qizz; double cii,dii,qii; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(MPOLE_LONG); @@ -78,13 +81,18 @@ void PairAmoeba::multipole() felec = electric / am_dielectric; + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + // compute the real space part of the Ewald summation if (mpole_rspace_flag) multipole_real(); + time1 = platform::walltime(); // compute the reciprocal space part of the Ewald summation if (mpole_kspace_flag) multipole_kspace(); + time2 = platform::walltime(); // compute the Ewald self-energy term over all the atoms @@ -109,6 +117,11 @@ void PairAmoeba::multipole() e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0)); empole += e; } + + // accumulate timing information + + time_mpole_rspace += time1 - time0; + time_mpole_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -361,6 +374,9 @@ void PairAmoeba::multipole_real() bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2; } for (k = 0; k < 6; k++) bn[k] *= felec; + //if (i == 0 && j < 10) { + // printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]); + //} // find damped multipole intermediates and energy value @@ -404,6 +420,8 @@ void PairAmoeba::multipole_real() term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; + + // find damped multipole 
intermediates for force and torque de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + @@ -444,6 +462,7 @@ void PairAmoeba::multipole_real() term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9); term6 = 4.0 * rr7; + } empole += e; @@ -482,6 +501,7 @@ void PairAmoeba::multipole_real() tq[i][2] += ttmi[2]; // increment force-based gradient and torque on second site + // commenting out j parts for DEBUGGING f[j][0] += frcx; f[j][1] += frcy; @@ -638,7 +658,7 @@ void PairAmoeba::multipole_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) m_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero(); // map atoms to grid @@ -647,7 +667,7 @@ void PairAmoeba::multipole_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = m_kspace->pre_convolution(); + FFT_SCALAR *gridfft = m_kspace->pre_convolution(); // --------------------- // convolution operation @@ -718,7 +738,7 @@ void PairAmoeba::multipole_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) m_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index 4d143c7a22..3c51426beb 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -21,6 +21,7 @@ #include "math_const.h" #include "math_special.h" #include "neigh_list.h" +#include "timer.h" #include #include @@ -55,6 +56,8 @@ void PairAmoeba::polar() double fix[3],fiy[3],fiz[3]; double tep[3]; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(POLAR_LONG); @@ -76,11 +79,16 @@ void PairAmoeba::polar() // compute the real space part of the dipole interactions + 
if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); + if (polar_rspace_flag) polar_real(); + time1 = platform::walltime(); // compute the reciprocal space part of dipole interactions if (polar_kspace_flag) polar_kspace(); + time2 = platform::walltime(); // compute the Ewald self-energy torque and virial terms @@ -133,6 +141,11 @@ void PairAmoeba::polar() virpolar[4] -= vxz; virpolar[5] -= vyz; } + + // accumulate timing information + + time_polar_rspace += time1 - time0; + time_polar_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -382,7 +395,7 @@ void PairAmoeba::polar_real() factor_uscale = 1.0; } } - + //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale); r = sqrt(r2); ck = rpole[j][0]; dkx = rpole[j][1]; @@ -597,7 +610,6 @@ void PairAmoeba::polar_real() dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir; dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir; dufld[i][5] += zr*tiz5 + zr*zr*tuir; - dufld[j][0] -= xr*tkx5 + xr*xr*tukr; dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr; dufld[j][2] -= yr*tky5 + yr*yr*tukr; @@ -855,6 +867,7 @@ void PairAmoeba::polar_real() frcx = -2.0 * depx; frcy = -2.0 * depy; frcz = -2.0 * depz; + } // get the dtau/dr terms used for mutual polarization force @@ -1327,7 +1340,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1336,7 +1349,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1386,7 +1399,7 @@ void PairAmoeba::polar_kspace() // post-convolution 
operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) p_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); // get potential @@ -1419,7 +1432,7 @@ void PairAmoeba::polar_kspace() // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre2 = (double ****) pc_kspace->zero(); + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); // map 2 values to grid @@ -1428,7 +1441,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = pc_kspace->pre_convolution(); + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1451,7 +1464,7 @@ void PairAmoeba::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) pc_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); // get potential @@ -1857,7 +1870,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1887,7 +1900,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1896,7 +1909,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = 
p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1953,7 +1966,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1962,12 +1975,12 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // gridfft1 = copy of first FFT int nfft_owned = p_kspace->nfft_owned; - memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double)); + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); // assign ??? to the PME grid @@ -1982,7 +1995,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1991,7 +2004,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/fix_amoeba_bitorsion.cpp b/src/AMOEBA/fix_amoeba_bitorsion.cpp index aeba26fb4d..cb8c62819d 100644 --- a/src/AMOEBA/fix_amoeba_bitorsion.cpp +++ b/src/AMOEBA/fix_amoeba_bitorsion.cpp @@ -194,8 +194,8 @@ void FixAmoebaBiTorsion::init() // error check that PairAmoeba or PairHiippo exist pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); if (!pair) error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out 
pair amoeba/hippo"); diff --git a/src/AMOEBA/improper_amoeba.cpp b/src/AMOEBA/improper_amoeba.cpp index b1e403da78..cb9db01b59 100644 --- a/src/AMOEBA/improper_amoeba.cpp +++ b/src/AMOEBA/improper_amoeba.cpp @@ -285,8 +285,9 @@ void ImproperAmoeba::init_style() // check if PairAmoeba disabled improper terms Pair *pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); + if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo"); int tmp; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index e8b7a18dba..0812fe43f0 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -29,6 +29,7 @@ #include "my_page.h" #include "neigh_list.h" #include "neighbor.h" +#include "timer.h" #include "update.h" #include @@ -47,6 +48,7 @@ enum{MUTUAL,OPT,TCG,DIRECT}; enum{GEAR,ASPC,LSQR}; #define DELTASTACK 16 +#define DEBUG_AMOEBA 0 /* ---------------------------------------------------------------------- */ @@ -85,6 +87,10 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp) cmp = fmp = nullptr; cphi = fphi = nullptr; + _moduli_array = nullptr; + _moduli_bsarray = nullptr; + _nfft_max = 0; + poli = nullptr; conj = conjp = nullptr; vec = vecp = nullptr; @@ -227,6 +233,9 @@ PairAmoeba::~PairAmoeba() memory->destroy(fphidp); memory->destroy(cphidp); + memory->destroy(_moduli_array); + memory->destroy(_moduli_bsarray); + memory->destroy(thetai1); memory->destroy(thetai2); memory->destroy(thetai3); @@ -349,12 +358,22 @@ void PairAmoeba::compute(int eflag, int vflag) if (update->ntimestep <= update->beginstep+1) { time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0; time_induce = time_polar = time_qxfer = 0.0; + + time_mpole_rspace = time_mpole_kspace = 0.0; + time_direct_rspace = time_direct_kspace = 0.0; + time_mutual_rspace = time_mutual_kspace = 0.0; + 
time_polar_rspace = time_polar_kspace = 0.0; + + time_grid_uind = time_fphi_uind = 0.0; + if (ic_kspace) { + ic_kspace->time_fft = 0.0; + } } double time0,time1,time2,time3,time4,time5,time6,time7,time8; - MPI_Barrier(world); - time0 = MPI_Wtime(); + if (timer->has_sync()) MPI_Barrier(world); + time0 = platform::walltime(); // if reneighboring step: // augment neighbor list to include 1-5 neighbor flags @@ -410,8 +429,7 @@ void PairAmoeba::compute(int eflag, int vflag) comm->forward_comm(this); if (amoeba) pbc_xred(); - - time1 = MPI_Wtime(); + time1 = platform::walltime(); // ---------------------------------------- // compute components of force field @@ -420,22 +438,22 @@ void PairAmoeba::compute(int eflag, int vflag) // buffered 14-7 Vdwl, pairwise if (amoeba && hal_flag) hal(); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // Pauli repulsion, pairwise if (!amoeba && repulse_flag) repulsion(); - time3 = MPI_Wtime(); + time3 = platform::walltime(); // Ewald dispersion, pairwise and long range if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion(); - time4 = MPI_Wtime(); + time4 = platform::walltime(); // multipole, pairwise and long range if (mpole_rspace_flag || mpole_kspace_flag) multipole(); - time5 = MPI_Wtime(); + time5 = platform::walltime(); // induced dipoles, interative CG relaxation // communicate induce() output values needed by ghost atoms @@ -445,17 +463,17 @@ void PairAmoeba::compute(int eflag, int vflag) cfstyle = INDUCE; comm->forward_comm(this); } - time6 = MPI_Wtime(); + time6 = platform::walltime(); // dipoles, pairwise and long range if (polar_rspace_flag || polar_kspace_flag) polar(); - time7 = MPI_Wtime(); + time7 = platform::walltime(); // charge transfer, pairwise if (!amoeba && qxfer_flag) charge_transfer(); - time8 = MPI_Wtime(); + time8 = platform::walltime(); // store energy components for output by compute pair command @@ -518,6 +536,44 @@ void PairAmoeba::finish() 
MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_qxfer = ave/comm->nprocs; + #if DEBUG_AMOEBA + // real-space/kspace breakdown + MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_polar_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_polar_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_grid_uind = ave/comm->nprocs; + + MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_fphi_uind = ave/comm->nprocs; + + double time_mutual_fft = 0; + if (ic_kspace) time_mutual_fft = ic_kspace->time_fft; + MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_fft = ave/comm->nprocs; + #endif // DEBUG_AMOEBA + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -534,8 +590,27 @@ void PairAmoeba::finish() utils::logmesg(lmp," Induce time: {:<12.6g} {:6.2f}%\n", time_induce, time_induce/time_total); utils::logmesg(lmp," Polar time: {:<12.6g} {:6.2f}%\n", time_polar, time_polar/time_total); if (!amoeba) - utils::logmesg(lmp," Qxfer time: {:<12.6g} {:6.2f}%\n", time_qxfer, time_qxfer/time_total); - utils::logmesg(lmp," Total time: {:<12.6g}\n",time_total 
* 100.0); + utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); + utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); + + #if DEBUG_AMOEBA + double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace; + double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace; + + utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); + utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total); + utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); + utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); + #endif } } @@ -2320,6 +2395,8 @@ void PairAmoeba::grow_local() firstneigh_pcpc = (double **) memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc"); } + + memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array"); } /* 
---------------------------------------------------------------------- diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 847764244b..cdeee6c95f 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -82,6 +82,12 @@ class PairAmoeba : public Pair { double time_init, time_hal, time_repulse, time_disp; double time_mpole, time_induce, time_polar, time_qxfer; + double time_mpole_rspace, time_mpole_kspace; + double time_direct_rspace, time_direct_kspace; + double time_mutual_rspace, time_mutual_kspace; + double time_polar_rspace, time_polar_kspace; + double time_grid_uind, time_fphi_uind; + // energy/virial components double ehal, erepulse, edisp, epolar, empole, eqxfer; @@ -324,8 +330,12 @@ class PairAmoeba : public Pair { double *qfac; // convoulution pre-factors double *gridfft1; // copy of p_kspace FFT grid - double **cmp, **fmp; // Cartesian and fractional multipoles - double **cphi, **fphi; + double **cmp,**fmp; // Cartesian and fractional multipoles + double **cphi,**fphi; + + double *_moduli_array; // buffers for moduli + double *_moduli_bsarray; + int _nfft_max; // params for current KSpace solve and FFT being worked on @@ -335,8 +345,12 @@ class PairAmoeba : public Pair { double ctf[10][10]; // indices NOT flipped vs Fortran double ftc[10][10]; // indices NOT flipped vs Fortran - class AmoebaConvolution *m_kspace, *p_kspace, *pc_kspace, *d_kspace; - class AmoebaConvolution *i_kspace, *ic_kspace; + class AmoebaConvolution *m_kspace; // multipole KSpace + class AmoebaConvolution *p_kspace; // polar KSpace + class AmoebaConvolution *pc_kspace; + class AmoebaConvolution *d_kspace; // dispersion KSpace + class AmoebaConvolution *i_kspace; // induce KSpace + class AmoebaConvolution *ic_kspace; // FFT grid size factors @@ -347,33 +361,33 @@ class PairAmoeba : public Pair { void hal(); - void repulsion(); - void damprep(double, double, double, double, double, double, double, double, int, double, double, - double *); + virtual void 
repulsion(); + void damprep(double, double, double, double, double, double, double, double, + int, double, double, double *); void dispersion(); - void dispersion_real(); + virtual void dispersion_real(); void dispersion_kspace(); void multipole(); - void multipole_real(); + virtual void multipole_real(); void multipole_kspace(); void polar(); void polar_energy(); - void polar_real(); - void polar_kspace(); + virtual void polar_real(); + virtual void polar_kspace(); void damppole(double, int, double, double, double *, double *, double *); - void induce(); + virtual void induce(); void ulspred(); - void ufield0c(double **, double **); + virtual void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); void dfield0c(double **, double **); - void umutual1(double **, double **); - void umutual2b(double **, double **); + virtual void umutual1(double **, double **); + virtual void umutual2b(double **, double **); void udirect1(double **); - void udirect2b(double **, double **); + virtual void udirect2b(double **, double **); void dampmut(double, double, double, double *); void dampdir(double, double, double, double *, double *); void cholesky(int, double *, double *); @@ -393,11 +407,11 @@ class PairAmoeba : public Pair { void fphi_to_cphi(double **, double **); void frac_to_cart(); - void grid_mpole(double **, double ***); - void fphi_mpole(double ***, double **); - void grid_uind(double **, double **, double ****); - void fphi_uind(double ****, double **, double **, double **); - void grid_disp(double ***); + void grid_mpole(double **, FFT_SCALAR ***); + void fphi_mpole(FFT_SCALAR ***, double **); + void grid_uind(double **, double **, FFT_SCALAR ****); + virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **); + void grid_disp(FFT_SCALAR ***); void kewald(); void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &, diff --git a/src/Depend.sh b/src/Depend.sh index 
10d612f490..470a0a2a2b 100755 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -45,6 +45,10 @@ depend () { # add one if statement per parent package # add one depend() call per child package that depends on that parent +if (test $1 = "AMOEBA") then + depend GPU +fi + if (test $1 = "ASPHERE") then depend GPU depend OPENMP diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index d28e6260f8..19e89498fc 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -28,6 +28,8 @@ action () { # list of files with optional dependcies +action amoeba_convolution_gpu.cpp amoeba_convolution.cpp +action amoeba_convolution_gpu.h amoeba_convolution.cpp action fix_gpu.cpp action fix_gpu.h action fix_nve_gpu.h @@ -41,6 +43,8 @@ action fix_npt_gpu.cpp action fix_nve_asphere_gpu.h fix_nve_asphere.h action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp action gpu_extra.h +action pair_amoeba_gpu.cpp pair_amoeba.cpp +action pair_amoeba_gpu.h pair_amoeba.h action pair_beck_gpu.cpp pair_beck.cpp action pair_beck_gpu.h pair_beck.h action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp @@ -89,6 +93,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp action pair_gauss_gpu.h pair_gauss.h action pair_gayberne_gpu.cpp pair_gayberne.cpp action pair_gayberne_gpu.h pair_gayberne.cpp +action pair_hippo_gpu.cpp pair_hippo.cpp +action pair_hippo_gpu.h pair_hippo.cpp action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp action pair_lj96_cut_gpu.h pair_lj96_cut.h action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp @@ -113,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h action pair_lj_cut_gpu.cpp action pair_lj_cut_gpu.h +action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp +action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp +action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp +action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp action pair_lj_smooth_gpu.cpp 
pair_lj_smooth.cpp action pair_lj_smooth_gpu.h pair_lj_smooth.cpp action pair_lj_expand_gpu.cpp @@ -155,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp action pppm_gpu.h pppm.cpp action pair_ufm_gpu.cpp pair_ufm.cpp action pair_ufm_gpu.h pair_ufm.h -action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp -action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp -action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp -action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp # edit 2 Makefile.package files to include/exclude package info diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp new file mode 100644 index 0000000000..908c9e409c --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -0,0 +1,181 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "amoeba_convolution_gpu.h" +#include "comm.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "grid3d.h" + +using namespace LAMMPS_NS; + +// DEBUG + +#define DEBUG_AMOEBA 0 +#if DEBUG_AMOEBA +char *labels[7] = + {(char *) "MPOLE_GRID", (char *) "POLAR_GRID", + (char *) "POLAR_GRIDC", (char *) "DISP_GRID", + (char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"}; + +enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2}; +#endif +// END DEBUG + +#define SCALE 0 + +//#define USE_AMOEBA_FFT +#ifdef USE_AMOEBA_FFT +// External functions from GPU library +int amoeba_setup_fft(const int size, const int numel, const int element_type); +int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode); +#endif + +/* ---------------------------------------------------------------------- + partition an FFT grid across processors + both for a brick and FFT x pencil decomposition + nx,nz,nz = global FFT grid size + order = size of stencil in each dimension that maps atoms to grid + adapted from PPPM::set_grid_local() +------------------------------------------------------------------------- */ + +AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, + int nx_caller, int ny_caller, int nz_caller, + int order_caller, int which_caller) : + AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller, + which_caller) +{ + +} + +/* ---------------------------------------------------------------------- + perform pre-convolution grid operations for 4d cgrid_brick array +------------------------------------------------------------------------- */ + +FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() +{ + int ix,iy,iz,n; + + // reverse comm for 4d brick grid + ghosts + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d"); +#endif + + gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR), + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + 
+#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d"); + debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d"); +#endif + // copy owned 4d brick grid values to FFT grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cfft[n++] = cgrid_brick[iz][iy][ix][0]; + cfft[n++] = cgrid_brick[iz][iy][ix][1]; + } + + // remap FFT grid from brick to x pencil partitioning + // NOTE: could just setup FFT to start from brick decomp and skip remap + + remap->perform(cfft,cfft,remap_buf); + +#if DEBUG_AMOEBA + debug_scalar(FFT,"PRE Convo / POST Remap"); + debug_file(FFT,"pre.convo.post.remap"); +#endif + + double time0,time1; + + MPI_Barrier(world); + time0 = platform::walltime(); + + // perform forward FFT + + #ifdef USE_AMOEBA_FFT + amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD); + #else + fft1->compute(cfft,cfft,FFT3d::FORWARD); + #endif + + time1 = platform::walltime(); + + time_fft += time1 - time0; + + if (SCALE) { + double scale = 1.0/nfft_global; + for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; + } + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"PRE Convo / POST FFT"); + debug_file(CFFT1,"pre.convo.post.fft"); +#endif + return cfft; +} + +/* ---------------------------------------------------------------------- + perform post-convolution grid operations for 4d cgrid_brick array +------------------------------------------------------------------------- */ + +void *AmoebaConvolutionGPU::post_convolution_4d() +{ + int ix,iy,iz,n; + + // perform backward FFT + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"POST Convo / PRE FFT"); + debug_file(CFFT1,"post.convo.pre.fft"); +#endif + + double time0,time1; + + MPI_Barrier(world); + time0 = platform::walltime(); + + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + + time1 = platform::walltime(); + + time_fft += time1 - time0; + +#if DEBUG_AMOEBA + debug_scalar(CFFT2,"POST Convo / POST FFT"); + 
debug_file(CFFT2,"post.convo.post.fft"); +#endif + // copy 1d complex values into 4d complex grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cgrid_brick[iz][iy][ix][0] = cfft[n++]; + cgrid_brick[iz][iy][ix][1] = cfft[n++]; + } + + // forward comm to populate ghost grid values + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d"); + debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d"); +#endif + gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR), + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + + return (void *) cgrid_brick; +} diff --git a/src/GPU/amoeba_convolution_gpu.h b/src/GPU/amoeba_convolution_gpu.h new file mode 100644 index 0000000000..4286f2155f --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.h @@ -0,0 +1,32 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H +#define LMP_AMOEBA_CONVOLUTION_GPU_H + +#include "amoeba_convolution.h" + + +namespace LAMMPS_NS { + +class AmoebaConvolutionGPU : public AmoebaConvolution { + public: + AmoebaConvolutionGPU(class LAMMPS *, class Pair *, int, int, int, int, int); + + FFT_SCALAR *pre_convolution_4d() override; + void *post_convolution_4d() override; + +}; + +} // namespace LAMMPS_NS +#endif diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 97f22da0a7..23191c12c8 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -131,7 +131,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : _gpu_mode = GPU_NEIGH; _particle_split = 1.0; int nthreads = 0; - int newtonflag = 0; + int newtonflag = force->newton_pair; int threads_per_atom = -1; double binsize = 0.0; char *opencl_args = nullptr; @@ -360,6 +360,8 @@ double FixGPU::memory_usage() return bytes; } +/* ---------------------------------------------------------------------- */ + double FixGPU::binsize(const double subx, const double suby, const double subz, const int nlocal, const double cut) { diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp new file mode 100644 index 0000000000..fd423486fd --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -0,0 +1,2067 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern/UChicago) +------------------------------------------------------------------------- */ + +#include "pair_amoeba_gpu.h" + +#include "amoeba_convolution_gpu.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "fix_store_peratom.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "math_const.h" +#include "memory.h" +#include "my_page.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" +#include + +using namespace LAMMPS_NS; +using namespace MathConst; + +// same as in amoeba_induce.cpp +enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; +enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +// same as in pair_amoeba.cpp +enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye + +// External functions from cuda library for atom decomposition + +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int* host_amtype2class, + const double *host_special_hal, const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const 
double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale); +void amoeba_gpu_clear(); + +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr); + +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr); + +void amoeba_gpu_update_fieldp(void **fieldp_ptr); + +void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int 
nxlo_out, const int nxhi_out); + +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); + +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi, + const double felec); + +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tq_ptr); + +double amoeba_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tq_pinned = nullptr; + + gpu_hal_ready = false; // true for AMOEBA when ready + gpu_repulsion_ready = false; // always false for AMOEBA + gpu_dispersion_real_ready = false; // always false for AMOEBA + gpu_multipole_real_ready = true; // need to be true for precompute() + gpu_udirect2b_ready = true; + gpu_umutual1_ready = true; + gpu_fphi_uind_ready = true; + gpu_umutual2b_ready = true; + gpu_polar_real_ready = true; // need to be true for copying data from device back to host + + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairAmoebaGPU::~PairAmoebaGPU() +{ + amoeba_gpu_clear(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::init_style() +{ + PairAmoeba::init_style(); + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + 
double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int mnf = 5e-2 * neighbor->oneatom; + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + pdamp, thole, dirdamp, amtype2class, special_hal, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, csix, adisp, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + maxspecial15, cell_size, gpu_mode, screen, + polar_dscale, polar_uscale); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + // replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + } +} + +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh; + + double 
sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + + amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } else { + auto *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } +} + +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to call reverse_comm() for crstyle = FIELD; 
+------------------------------------------------------------------------- */ + +void PairAmoebaGPU::induce() +{ + bool done; + int i,j,m,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // owned atoms + + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // need reverse_comm if dfield0c (i.e. udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } + + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + // NOTE: this is for ic_kspace, and thetai[1-3] + + if (ic_kspace) + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
+ // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < 
nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += conjp[i][j]*vecp[i][j]; + sum += 
rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; j++) { + uind[i][j] += 
term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (comm->me == 0) + error->warning(FLERR,"AMOEBA induced dipoles did not converge"); + } + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] = upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b(double **field, double **fieldp) +{ + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + + int inum; + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + amoeba_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom 
at a time in real-space double loop over atoms & neighs + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). + // udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + if (acc_float) { + auto field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } + + +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b_cpu() +{ + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti; + double pgamma; + double damp,expdamp; + double scale3,scale5; + double scalek; + double bn[4],bcn[3]; + double factor_uscale; + + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + + 
// neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? + + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - 
(1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } else { + if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::ufield0c(double **field, double **fieldp) +{ + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + + // get the real space portion of the mutual field first + + double time0, time1, time2; + + MPI_Barrier(world); + time0 = platform::walltime(); + + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = platform::walltime(); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + time2 = platform::walltime(); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } + + // accumulate the field and 
fieldp values from the real-space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + amoeba_gpu_update_fieldp(&fieldp_pinned); + + int inum = atom->nlocal; + if (acc_float) { + auto field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } + + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; +} + +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual1(double **field, double **fieldp) +{ + int m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (int j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + for (int i = 0; i 
< nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } + + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); + + // map 2 values to grid + + double time0, time1; + MPI_Barrier(world); + time0 = platform::walltime(); + + grid_uind(fuind,fuinp,gridpre); + + time1 = platform::walltime(); + time_grid_uind += (time1 - time0); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); + + // get potential + + MPI_Barrier(world); + time0 = platform::walltime(); + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + time1 = platform::walltime(); + time_fphi_uind += (time1 - 
time0); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } + +} + +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + amoeba_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += 
nlocal; + } + } + + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + } + +} + +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + 
amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); +} + +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff and aewald for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + + amoeba_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } else { + auto *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } +} + +/* ---------------------------------------------------------------------- + polar_kspace = KSpace portion of induced dipole polarization + adapted from Tinker eprecip1() routine + same as PairAmoeba, except that fphi_uind() is reimplemented here + ------------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_kspace() +{ + int i,j,k,m,n; + int nhalf1,nhalf2,nhalf3; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + int 
j1,j2,j3; + int ix,iy,iz; + double eterm,felec; + double r1,r2,r3; + double h1,h2,h3; + double f1,f2,f3; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double volterm,denom; + double hsq,expterm; + double term,pterm; + double vterm,struc2; + double tep[3]; + double fix[3],fiy[3],fiz[3]; + double cphid[4],cphip[4]; + double a[3][3]; // indices not flipped vs Fortran + + bool gpu_fphi_mpole_ready = true; + + // indices into the electrostatic field array + // decremented by 1 versus Fortran + + int deriv1[10] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; + int deriv2[10] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; + int deriv3[10] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // owned atoms + + double **x = atom->x; + double **f = atom->f; + int nlocal = atom->nlocal; + + double volbox = domain->prd[0] * domain->prd[1] * domain->prd[2]; + pterm = pow((MY_PI/aewald),2.0); + volterm = MY_PI * volbox; + + // initialize variables required for the scalar summation + + felec = electric / am_dielectric; + + // remove scalar sum virial from prior multipole FFT + // can only do this if multipoles were computed with same aeewald = apewald + // else need to re-compute it via new long-range solve + + nfft1 = p_kspace->nx; + nfft2 = p_kspace->ny; + nfft3 = p_kspace->nz; + bsorder = p_kspace->order; + + nhalf1 = (nfft1+1) / 2; + nhalf2 = (nfft2+1) / 2; + nhalf3 = (nfft3+1) / 2; + + nxlo = p_kspace->nxlo_fft; + nxhi = p_kspace->nxhi_fft; + nylo = p_kspace->nylo_fft; + nyhi = p_kspace->nyhi_fft; + nzlo = p_kspace->nzlo_fft; + nzhi = p_kspace->nzhi_fft; + + // use previous results or compute new qfac and convolution + + if (aewald == aeewald) { + vxx = -vmsave[0]; + vyy = -vmsave[1]; + vzz = -vmsave[2]; + vxy = -vmsave[3]; + vxz = -vmsave[4]; + vyz = -vmsave[5]; + + } else { + + // setup stencil size and B-spline coefficients + + moduli(); + 
bspline_fill(); + + // allocate memory and make early host-device transfers + + // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill + if (gpu_fphi_mpole_ready) { + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, + thetai1, thetai2, thetai3, igrid, + p_kspace->nzlo_out, p_kspace->nzhi_out, + p_kspace->nylo_out, p_kspace->nyhi_out, + p_kspace->nxlo_out, p_kspace->nxhi_out); + } + + + // convert Cartesian multipoles to fractional coordinates + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // zero virial accumulation variables + + vxx = vyy = vzz = vxy = vxz = vyz = 0.0; + + // perform convolution on K-space points I own + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? 
k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + if (hsq) expterm = exp(term) / denom; + struc2 = gridfft[n]*gridfft[n] + gridfft[n+1]*gridfft[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx -= h1*h1*vterm - eterm; + vyy -= h2*h2*vterm - eterm; + vzz -= h3*h3*vterm - eterm; + vxy -= h1*h2*vterm; + vxz -= h1*h3*vterm; + vyz -= h2*h3*vterm; + } + + expterm = qfac[m++]; + gridfft[n] *= expterm; + gridfft[n+1] *= expterm; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 3d grid in brick decomp w/ ghost values + + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); + + // get potential + + if (!gpu_fphi_mpole_ready) { + fphi_mpole(gridpost,fphi); + + for (i = 0; i < nlocal; i++) { + for (k = 0; k < 20; k++) + fphi[i][k] *= felec; + } + + } else { + void* fphi_pinned = nullptr; + amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); + if (acc_float) { + auto _fphi_ptr = (float *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + } else { + auto _fphi_ptr = (double *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + } + } + + // convert field from fractional to Cartesian + + fphi_to_cphi(fphi,cphi); + } + + // convert Cartesian induced dipoles to fractional coordinates + + for (i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (i = 0; i < nlocal; i++) { + 
for (j = 0; j < 3; j++) { + fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; + fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; + } + } + + // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); + + // map 2 values to grid + + grid_uind(fuind,fuinp,gridpre2); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // use qfac values from above or from induce() + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); + + // get potential + + fphi_uind(gridpost,fphid,fphip,fphidp); + + // TODO: port the remaining loops to the GPU + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fphid[i][j]; + fphip[i][j] = felec * fphip[i][j]; + } + for (j = 0; j < 20; j++) + fphidp[i][j] = felec * fphidp[i][j]; + } + + // increment the dipole polarization gradient contributions + + for (i = 0; i < nlocal; i++) { + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + for (k = 0; k < 3; k++) { + j1 = deriv1[k+1]; + j2 = deriv2[k+1]; + j3 = deriv3[k+1]; + f1 += (fuind[i][k]+fuinp[i][k])*fphi[i][j1]; + f2 += (fuind[i][k]+fuinp[i][k])*fphi[i][j2]; + f3 += (fuind[i][k]+fuinp[i][k])*fphi[i][j3]; + if (poltyp == MUTUAL) { + f1 += fuind[i][k]*fphip[i][j1] + fuinp[i][k]*fphid[i][j1]; + f2 += fuind[i][k]*fphip[i][j2] + fuinp[i][k]*fphid[i][j2]; + f3 += fuind[i][k]*fphip[i][j3] + 
fuinp[i][k]*fphid[i][j3]; + } + } + for (k = 0; k < 10; k++) { + f1 += fmp[i][k]*fphidp[i][deriv1[k]]; + f2 += fmp[i][k]*fphidp[i][deriv2[k]]; + f3 += fmp[i][k]*fphidp[i][deriv3[k]]; + } + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + f[i][0] -= h1; + f[i][1] -= h2; + f[i][2] -= h3; + } + + // set the potential to be the induced dipole average + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + fphidp[i][j] *= 0.5; + } + + fphi_to_cphi(fphidp,cphidp); + + // get the fractional to Cartesian transformation matrix + + //frac_to_cart(); + + // increment the dipole polarization virial contributions + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (k = 1; k < 4; k++) { + cphid[j] += ftc[j][k]*fphid[i][k]; + cphip[j] += ftc[j][k]*fphip[i][k]; + } + } + + vxx -= cmp[i][1]*cphidp[i][1] + + 0.5*((uind[i][0]+uinp[i][0])*cphi[i][1]); + vyy -= cmp[i][2]*cphidp[i][2] + + 0.5*((uind[i][1]+uinp[i][1])*cphi[i][2]); + vzz -= cmp[i][3]*cphidp[i][3] + + 0.5*((uind[i][2]+uinp[i][2])*cphi[i][3]); + vxy -= 0.5*(cphidp[i][1]*cmp[i][2]+cphidp[i][2]*cmp[i][1]) + + 0.25*((uind[i][1]+uinp[i][1])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][2]); + vyz -= 0.5*(cphidp[i][2]*cmp[i][3]+cphidp[i][3]*cmp[i][2]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][2] + + (uind[i][1]+uinp[i][1])*cphi[i][3]); + vxz -= 0.5*(cphidp[i][1]*cmp[i][3]+cphidp[i][3]*cmp[i][1]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][3]); + + vxx -= 2.0*cmp[i][4]*cphidp[i][4] + cmp[i][7]*cphidp[i][7] + + cmp[i][8]*cphidp[i][8]; + vyy -= 2.0*cmp[i][5]*cphidp[i][5] + cmp[i][7]*cphidp[i][7] + + cmp[i][9]*cphidp[i][9]; + vzz -= 2.0*cmp[i][6]*cphidp[i][6] + cmp[i][8]*cphidp[i][8] + + cmp[i][9]*cphidp[i][9]; + vxy -= (cmp[i][4]+cmp[i][5])*cphidp[i][7] 
+ + 0.5*(cmp[i][7]*(cphidp[i][5]+cphidp[i][4]) + + cmp[i][8]*cphidp[i][9]+cmp[i][9]*cphidp[i][8]); + vyz -= (cmp[i][5]+cmp[i][6])*cphidp[i][9] + + 0.5*(cmp[i][9]*(cphidp[i][5]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][8]+cmp[i][8]*cphidp[i][7]); + vxz -= (cmp[i][4]+cmp[i][6])*cphidp[i][8] + + 0.5*(cmp[i][8]*(cphidp[i][4]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][9]+cmp[i][9]*cphidp[i][7]); + + if (poltyp == MUTUAL) { + vxx -= 0.5 * (cphid[1]*uinp[i][0]+cphip[1]*uind[i][0]); + vyy -= 0.5 * (cphid[2]*uinp[i][1]+cphip[2]*uind[i][1]); + vzz -= 0.5 * (cphid[3]*uinp[i][2]+cphip[3]*uind[i][2]); + vxy -= 0.25 * (cphid[1]*uinp[i][1]+cphip[1]*uind[i][1] + + cphid[2]*uinp[i][0]+cphip[2]*uind[i][0]); + vyz -= 0.25 * (cphid[2]*uinp[i][2]+cphip[2]*uind[i][2] + + cphid[3]*uinp[i][1]+cphip[3]*uind[i][1]); + vxz -= 0.25 * (cphid[1]*uinp[i][2]+cphip[1]*uind[i][2] + + cphid[3]*uinp[i][0]+cphip[3]*uind[i][0]); + } + } + + + // resolve site torques then increment forces and virial + + for (i = 0; i < nlocal; i++) { + tep[0] = cmp[i][3]*cphidp[i][2] - cmp[i][2]*cphidp[i][3] + + 2.0*(cmp[i][6]-cmp[i][5])*cphidp[i][9] + cmp[i][8]*cphidp[i][7] + + cmp[i][9]*cphidp[i][5]- cmp[i][7]*cphidp[i][8] - cmp[i][9]*cphidp[i][6]; + tep[1] = cmp[i][1]*cphidp[i][3] - cmp[i][3]*cphidp[i][1] + + 2.0*(cmp[i][4]-cmp[i][6])*cphidp[i][8] + cmp[i][7]*cphidp[i][9] + + cmp[i][8]*cphidp[i][6] - cmp[i][8]*cphidp[i][4] - cmp[i][9]*cphidp[i][7]; + tep[2] = cmp[i][2]*cphidp[i][1] - cmp[i][1]*cphidp[i][2] + + 2.0*(cmp[i][5]-cmp[i][4])*cphidp[i][7] + cmp[i][7]*cphidp[i][4] + + cmp[i][9]*cphidp[i][8] - cmp[i][7]*cphidp[i][5] - cmp[i][8]*cphidp[i][9]; + + torque2force(i,tep,fix,fiy,fiz,f); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx += 
xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy += yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz += zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy += 0.5*(yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vyz += 0.5*(zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + vxz += 0.5*(zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + } + + // account for dipole response terms in the OPT method + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (k = 0; k < optorder; k++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fopt[i][k][j]; + fphip[i][j] = felec * foptp[i][k][j]; + } + + for (m = 0; m < optorder-k; m++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[0][j]*uopt[i][m][0] + a[1][j]*uopt[i][m][1] + + a[2][j]*uopt[i][m][2]; + fuinp[i][j] = a[0][j]*uoptp[i][m][0] + a[1][j]*uoptp[i][m][1] + + a[2][j]*uoptp[i][m][2]; + } + + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + + for (j = 0; j < 3; j++) { + j1 = deriv1[j+1]; + j2 = deriv2[j+1]; + j3 = deriv3[j+1]; + f1 += fuind[i][j]*fphip[i][j1] + fuinp[i][j]*fphid[i][j1]; + f2 += fuind[i][j]*fphip[i][j2] + fuinp[i][j]*fphid[i][j2]; + f3 += fuind[i][j]*fphip[i][j3] + fuinp[i][j]*fphid[i][j3]; + } + + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + + f[i][0] -= copm[k+m+1]*h1; + f[i][1] -= copm[k+m+1]*h2; + f[i][2] -= copm[k+m+1]*h3; + + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (j1 = 1; j1 < 4; j1++) { + cphid[j] += ftc[j][j1]*fphid[i][j1]; + cphip[j] += ftc[j][j1]*fphip[i][j1]; + } + } + + vxx -= 0.5*copm[k+m+1] * + (cphid[1]*uoptp[i][m][0] + cphip[1]*uopt[i][m][0]); + vyy -= 0.5*copm[k+m+1] * + (cphid[2]*uoptp[i][m][1]+ cphip[2]*uopt[i][m][1]); + vzz -= 0.5*copm[k+m+1] * + (cphid[3]*uoptp[i][m][2]+ cphip[3]*uopt[i][m][2]); 
+ vxy -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][1]+ cphip[1]*uopt[i][m][1]+ + cphid[2]*uoptp[i][m][0]+ cphip[2]*uopt[i][m][0]); + vyz -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][2]+ cphip[1]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][0]+ cphip[3]*uopt[i][m][0]); + vxz -= 0.25*copm[k+m+1] * + (cphid[2]*uoptp[i][m][2]+ cphip[2]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][1]+ cphip[3]*uopt[i][m][1]); + } + } + } + } + + // assign permanent and induced multipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] += uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); + + // assign induced dipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] += uind[i][j-1] - uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors + + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; 
i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + + // assign only the induced dipoles to the PME grid + // and perform the 3-D FFT forward transformation + // NOTE: why is there no inverse FFT in this section? + + if (poltyp == DIRECT || poltyp == TCG) { + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + cmp[i][j] = 0.0; + for (j = 1; j < 4; j++) + cmp[i][j] = uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); + + // assign ??? 
to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] = uind[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + gridpre = (FFT_SCALAR ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + } + + // increment the total internal virial tensor components + + if (vflag_global) { + virpolar[0] -= vxx; + virpolar[1] -= vyy; + virpolar[2] -= vzz; + virpolar[3] -= vxy; + virpolar[4] -= vxz; + virpolar[5] -= vyz; + } +} + +/* ---------------------------------------------------------------------- + compute atom forces from torques +------------------------------------------------------------------------- */ + +template 
+void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],_tq[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairAmoebaGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + amoeba_gpu_bytes(); +} diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h new file mode 100644 index 0000000000..be53f7ef50 --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.h @@ -0,0 +1,72 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS 
Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(amoeba/gpu,PairAmoebaGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_AMOEBA_GPU_H +#define LMP_PAIR_AMOEBA_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairAmoebaGPU : public PairAmoeba { + public: + PairAmoebaGPU(LAMMPS *lmp); + ~PairAmoebaGPU() override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void induce() override; + + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; + void polar_kspace() override; + + private: + int gpu_mode; + double cpu_time; + void *tq_pinned; + void *fieldp_pinned; + bool acc_float; + + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; + bool gpu_multipole_real_ready; + bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; + + void udirect2b_cpu(); + + template + void compute_force_from_torque(const numtyp*, double**, double*); +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp new file mode 100644 index 0000000000..9d286d5db7 --- /dev/null +++ b/src/GPU/pair_hippo_gpu.cpp @@ -0,0 +1,1494 @@ +// clang-format off 
+/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern/UChicago) +------------------------------------------------------------------------- */ + +#include "pair_hippo_gpu.h" + +#include "amoeba_convolution_gpu.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "fix_store_peratom.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "math_const.h" +#include "memory.h" +#include "my_page.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" +#include + +using namespace LAMMPS_NS; +using namespace MathConst; + +enum{INDUCE,RSD,SETUP_hippo,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; +enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +// same as in pair_amoeba.cpp +enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye + +// External functions from cuda library for atom decomposition + +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const 
double *host_thole, + const double *host_dirdamp, const int* host_amtype2class, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale); +void hippo_gpu_clear(); + +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr); + +void 
hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, + const double aewald, const double off2); + +void hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *host_pval, const double aewald, const double off2, void **fieldp_ptr); + +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr); + +void hippo_gpu_update_fieldp(void **fieldp_ptr); + +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); + +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tq_ptr); + +double hippo_gpu_bytes(); + +/* 
---------------------------------------------------------------------- */ + +PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + amoeba = false; + mystyle = "hippo"; + + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tq_pinned = nullptr; + + gpu_hal_ready = false; // always false for HIPPO + gpu_repulsion_ready = true; + gpu_dispersion_real_ready = true; + gpu_multipole_real_ready = true; + gpu_udirect2b_ready = true; + gpu_umutual1_ready = true; + gpu_fphi_uind_ready = true; + gpu_umutual2b_ready = true; + gpu_polar_real_ready = true; // need to be true for copying data from device back to host + + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairHippoGPU::~PairHippoGPU() +{ + hippo_gpu_clear(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairHippoGPU::init_style() +{ + PairAmoeba::init_style(); + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int mnf = 5e-2 * neighbor->oneatom; + int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + 
pdamp, thole, dirdamp, amtype2class, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, sizpr, dmppr, elepr, + csix, adisp, pcore, palpha, + atom->nlocal, atom->nlocal+atom->nghost, mnf, + maxspecial, maxspecial15, cell_size, gpu_mode, + screen, polar_dscale, polar_uscale); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + // replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + } +} + +/* ---------------------------------------------------------------------- + repulsion = Pauli repulsion interactions + adapted from Tinker erepel1b() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::repulsion() +{ + if (!gpu_repulsion_ready) { + PairAmoeba::repulsion(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, 
domain->boxlo, domain->prd); + + // select the correct cutoff for the term + + choose(REPULSE); + + hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, cut2, + c0, c1, c2, c3, c4, c5, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse + } else { + auto *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse + } +} + +/* ---------------------------------------------------------------------- + dispersion_real = real-space portion of Ewald dispersion + adapted from Tinker edreal1d() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::dispersion_real() +{ + if (!gpu_dispersion_real_ready) { + PairAmoeba::dispersion_real(); + return; + } + + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff for the term + + if (use_dewald) choose(DISP_LONG); + else choose(DISP); + + hippo_gpu_compute_dispersion_real(amtype, amgroup, rpole, aewald, off2); +} + +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine 
+------------------------------------------------------------------------- */ + +void PairHippoGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + double *pval = atom->dvector[index_pval]; + + hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, pval, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } else { + auto *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, f, virmpole); // fmpole + } +} + +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to 
call reverse_comm() for crstyle = FIELD; +------------------------------------------------------------------------- */ + +void PairHippoGPU::induce() +{ + bool done; + int i,j,m,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // owned atoms + + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // need reverse_comm if dfield0c (i.e. udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } + + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + if (ic_kspace) + hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
+ // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < 
nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm(this); + } + + //error->all(FLERR,"STOP"); + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += 
conjp[i][j]*vecp[i][j]; + sum += rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; 
j++) { + uind[i][j] += term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (comm->me == 0) + error->warning(FLERR,"HIPPO induced dipoles did not converge"); + } + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] = upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b(double **field, double **fieldp) +{ + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + + int inum; + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + double *pval = atom->dvector[index_pval]; + hippo_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); + + // rebuild 
dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). + // udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + if (acc_float) { + auto field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + + } else { + + auto field_ptr = (double *)fieldp_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b_cpu() +{ + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti; + double pgamma; + double damp,expdamp; + double scale3,scale5,scalek; + double bn[4],bcn[3]; + double factor_uscale; + + int inum,jnum; + int 
*ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + + // neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? + + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - 
expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } else { + + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairHippoGPU::ufield0c(double **field, double **fieldp) +{ + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + + // get the real space portion of the mutual field first + + double time0, time1, time2; + MPI_Barrier(world); + time0 = platform::walltime(); + + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = platform::walltime(); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + time2 = platform::walltime(); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } + + // accumulate the field and 
fieldp values from the real-space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + hippo_gpu_update_fieldp(&fieldp_pinned); + int inum = atom->nlocal; + + if (acc_float) { + auto *field_ptr = (float *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + + } else { + auto *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + field_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; + } + } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; +} + +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual1(double **field, double **fieldp) +{ + int m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (int j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + + for (int i = 0; 
i < nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } + + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); + + // map 2 values to grid + + double time0, time1; + MPI_Barrier(world); + time0 = platform::walltime(); + + grid_uind(fuind,fuinp,gridpre); + + time1 = platform::walltime(); + time_grid_uind += (time1 - time0); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); + + // get potential + + MPI_Barrier(world); + time0 = platform::walltime(); + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + time1 = platform::walltime(); + time_fphi_uind += (time1 - 
time0); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + // convert the dipole fields from fractional to Cartesian + + for (int i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } +} + +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairHippoGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + hippo_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); + + int nlocal = atom->nlocal; + 
if (acc_float) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + } +} + +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + 
domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + double *pval = atom->dvector[index_pval]; + hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); +} + +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + double **f = atom->f; + double sublo[3],subhi[3]; + + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + + // select the correct cutoff and aewald for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + double *pval = atom->dvector[index_pval]; + + hippo_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, pval, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); + + // reference to the tep array from GPU lib + + if (acc_float) { + auto *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } else { + auto *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, f, virpolar); // fpolar + } +} + +/* ---------------------------------------------------------------------- + compute atom forces from torques used by various terms 
+------------------------------------------------------------------------- */ + +template +void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],_tq[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairHippoGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + hippo_gpu_bytes(); +} diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h new file mode 100644 index 0000000000..d160446d77 --- /dev/null +++ b/src/GPU/pair_hippo_gpu.h @@ -0,0 +1,73 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively 
Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(hippo/gpu,PairHippoGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_HIPPO_GPU_H +#define LMP_PAIR_HIPPO_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairHippoGPU : public PairAmoeba { + public: + PairHippoGPU(LAMMPS *lmp); + ~PairHippoGPU() override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void induce() override; + + void repulsion() override; + void dispersion_real() override; + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; + + private: + int gpu_mode; + double cpu_time; + void *tq_pinned; + void *fieldp_pinned; + bool acc_float; + + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; + bool gpu_multipole_real_ready; + bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; + + void udirect2b_cpu(); + + template + void compute_force_from_torque(const numtyp*, double**, double*); +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index e7472f4e88..f493b5438a 100755 
--- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -204,6 +204,8 @@ action mliap_model_linear_kokkos.h mliap_model_linear.h action mliap_model_python_kokkos.cpp mliap_model_linear.cpp action mliap_model_python_kokkos.h mliap_model_linear.h action mliap_model_kokkos.h mliap_model.h +action mliap_unified_kokkos.cpp mliap_unified.cpp +action mliap_unified_kokkos.h mliap_unified.h action mliap_so3_kokkos.cpp mliap_so3.cpp action mliap_so3_kokkos.h mliap_so3.h action modify_kokkos.cpp @@ -314,6 +316,8 @@ action pair_lj_spica_kokkos.cpp pair_lj_spica.cpp action pair_lj_spica_kokkos.h pair_lj_spica.h action pair_meam_kokkos.cpp pair_meam.cpp action pair_meam_kokkos.h pair_meam.h +action pair_meam_ms_kokkos.cpp pair_meam_ms.cpp +action pair_meam_ms_kokkos.h pair_meam_ms.h action pair_mliap_kokkos.cpp pair_mliap.cpp action pair_mliap_kokkos.h pair_mliap.h action pair_morse_kokkos.cpp @@ -365,6 +369,7 @@ action verlet_kokkos.h # Install cython pyx file only if non-KOKKOS version is present action mliap_model_python_couple_kokkos.pyx mliap_model_python_couple.pyx +action mliap_unified_couple_kokkos.pyx mliap_unified_couple.pyx # edit 2 Makefile.package files to include/exclude package info @@ -423,15 +428,19 @@ fi if (test $1 = 1) then if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then cythonize -3 ../mliap_model_python_couple_kokkos.pyx + cythonize -3 ../mliap_unified_couple_kokkos.pyx fi elif (test $1 = 0) then rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h + rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h elif (test $1 = 2) then if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then cythonize -3 ../mliap_model_python_couple_kokkos.pyx + cythonize -3 ../mliap_unified_couple_kokkos.pyx else rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h + rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h fi fi diff --git 
a/src/KOKKOS/fix_nvt_kokkos.cpp b/src/KOKKOS/fix_nvt_kokkos.cpp index d98ba5c163..16328c5e3a 100644 --- a/src/KOKKOS/fix_nvt_kokkos.cpp +++ b/src/KOKKOS/fix_nvt_kokkos.cpp @@ -39,7 +39,7 @@ FixNVTKokkos::FixNVTKokkos(LAMMPS *lmp, int narg, char **arg) : // id = fix-ID + temp this->id_temp = utils::strdup(std::string(this->id)+"_temp"); - this->modify->add_compute(fmt::format("{} all temp/kk",this->id_temp)); + this->modify->add_compute(fmt::format("{} {} temp/kk",this->id_temp,this->group->names[this->igroup])); this->tcomputeflag = 1; } diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp index 12b1e8f322..69ffdcd684 100644 --- a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp +++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp @@ -67,7 +67,7 @@ FixNVTSllodKokkos::FixNVTSllodKokkos(LAMMPS *lmp, int narg, char **a } this->id_temp = utils::strdup(std::string(this->id)+"_temp"); - this->modify->add_compute(fmt::format("{} all temp/deform/kk",this->id_temp)); + this->modify->add_compute(fmt::format("{} {} temp/deform/kk",this->id_temp,this->group->names[this->igroup])); this->tcomputeflag = 1; this->nondeformbias = 0; } diff --git a/src/KOKKOS/fix_setforce_kokkos.cpp b/src/KOKKOS/fix_setforce_kokkos.cpp index 4b1c31bec0..9f193bc6e4 100644 --- a/src/KOKKOS/fix_setforce_kokkos.cpp +++ b/src/KOKKOS/fix_setforce_kokkos.cpp @@ -77,9 +77,8 @@ void FixSetForceKokkos::init() template void FixSetForceKokkos::post_force(int /*vflag*/) { - atomKK->sync(execution_space, X_MASK | F_MASK | MASK_MASK); + atomKK->sync(execution_space, F_MASK | MASK_MASK); - x = atomKK->k_x.view(); f = atomKK->k_f.view(); mask = atomKK->k_mask.view(); @@ -88,6 +87,8 @@ void FixSetForceKokkos::post_force(int /*vflag*/) // update region if necessary if (region) { + if (!utils::strmatch(region->style, "^block")) + error->all(FLERR,"Cannot (yet) use {}-style region with fix setforce/kk",region->style); region->prematch(); DAT::tdual_int_1d k_match = 
DAT::tdual_int_1d("setforce:k_match",nlocal); KokkosBase* regionKKBase = dynamic_cast(region); diff --git a/src/KOKKOS/meam_dens_final_kokkos.h b/src/KOKKOS/meam_dens_final_kokkos.h index bcc7b558dc..5e7ffdec20 100644 --- a/src/KOKKOS/meam_dens_final_kokkos.h +++ b/src/KOKKOS/meam_dens_final_kokkos.h @@ -61,34 +61,61 @@ void MEAMKokkos::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT if (elti >= 0) { scaleii = d_scale(type[i],type[i]); d_rho1[i] = 0.0; - d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i]; + if (msmeamflag) { + d_rho2[i] = -1.0 / 3.0 * (d_arho2b[i] * d_arho2b[i] + - d_arho2mb[i] * d_arho2mb[i]); + } else{ + d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i]; + } d_rho3[i] = 0.0; for (int m = 0; m < 3; m++) { - d_rho1[i] += d_arho1(i,m) * d_arho1(i,m); - d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m); - } - for (int m = 0; m < 6; m++) - d_rho2[i] += v2D[m] * d_arho2(i,m) * d_arho2(i,m); - for (int m = 0; m < 10; m++) - d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m); - - if (d_rho0[i] > 0.0) { - if (ialloy == 1) { - d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0)); - d_t_ave(i,1) = fdiv_zero_kk(d_t_ave(i,1), d_tsq_ave(i,1)); - d_t_ave(i,2) = fdiv_zero_kk(d_t_ave(i,2), d_tsq_ave(i,2)); - } else if (ialloy == 2) { - d_t_ave(i,0) = t1_meam[elti]; - d_t_ave(i,1) = t2_meam[elti]; - d_t_ave(i,2) = t3_meam[elti]; - } else { - d_t_ave(i,0) /= d_rho0[i]; - d_t_ave(i,1) /= d_rho0[i]; - d_t_ave(i,2) /= d_rho0[i]; + if (msmeamflag) { + d_rho1[i] = d_rho1[i] + d_arho1(i, m) * d_arho1(i, m) + - d_arho1m(i, m) * d_arho1m(i, m); + d_rho3[i] = d_rho3[i] - 3.0 / 5.0 * (d_arho3b(i, m) * d_arho3b(i, m) + - d_arho3mb(i, m) * d_arho3mb(i, m)); + } else{ + d_rho1[i] += d_arho1(i,m) * d_arho1(i,m); + d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m); } } + for (int m = 0; m < 6; m++){ + if (msmeamflag) { + d_rho2[i] = d_rho2[i] + v2D[m] * (d_arho2(i, m) * d_arho2(i, m) + - d_arho2m(i, m) * d_arho2m(i, m)); + } else{ + d_rho2[i] += v2D[m] * 
d_arho2(i,m) * d_arho2(i,m); + } + } + for (int m = 0; m < 10; m++) + if (msmeamflag) { + d_rho3[i] = d_rho3[i] + v3D[m] * (d_arho3(i, m) * d_arho3(i, m) + - d_arho3m(i, m) * d_arho3m(i, m)); + } else{ + d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m); + } - d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i]; + if (msmeamflag) { + // with msmeam all t weights are already accounted for in rho + d_gamma[i] = d_rho1[i] + d_rho2[i] + d_rho3[i]; + } else{ + if (d_rho0[i] > 0.0) { + if (ialloy == 1) { + d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0)); + d_t_ave(i,1) = fdiv_zero_kk(d_t_ave(i,1), d_tsq_ave(i,1)); + d_t_ave(i,2) = fdiv_zero_kk(d_t_ave(i,2), d_tsq_ave(i,2)); + } else if (ialloy == 2) { + d_t_ave(i,0) = t1_meam[elti]; + d_t_ave(i,1) = t2_meam[elti]; + d_t_ave(i,2) = t3_meam[elti]; + } else { + d_t_ave(i,0) /= d_rho0[i]; + d_t_ave(i,1) /= d_rho0[i]; + d_t_ave(i,2) /= d_rho0[i]; + } + } + d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i]; + } if (d_rho0[i] > 0.0) d_gamma[i] /= (d_rho0[i] * d_rho0[i]); diff --git a/src/KOKKOS/meam_dens_init_kokkos.h b/src/KOKKOS/meam_dens_init_kokkos.h index 31ac046dcf..60bb6553d8 100644 --- a/src/KOKKOS/meam_dens_init_kokkos.h +++ b/src/KOKKOS/meam_dens_init_kokkos.h @@ -43,11 +43,23 @@ void MEAMKokkos::operator()(TagMEAMZero, const int &i) const { d_rho0[i] = 0.0; d_arho2b[i] = 0.0; d_arho1(i,0) = d_arho1(i,1) = d_arho1(i,2) = 0.0; - for (int j = 0; j < 6; j++) + if (msmeamflag) { + d_arho2mb[i] = 0.0; + d_arho1m(i,0) = d_arho1m(i,1) = d_arho1m(i,2) = 0.0; + } + for (int j = 0; j < 6; j++) { d_arho2(i,j) = 0.0; - for (int j = 0; j < 10; j++) + if (msmeamflag) + d_arho2m(i,j) = 0.0; + } + for (int j = 0; j < 10; j++) { d_arho3(i,j) = 0.0; + if (msmeamflag) + d_arho3m(i,j) = 0.0; + } d_arho3b(i,0) = d_arho3b(i,1) = d_arho3b(i,2) = 0.0; + if (msmeamflag) + d_arho3mb(i,0) = d_arho3mb(i,1) = d_arho3mb(i,2) = 0.0; d_t_ave(i,0) = 
d_t_ave(i,1) = d_t_ave(i,2) = 0.0; d_tsq_ave(i,0) = d_tsq_ave(i,1) = d_tsq_ave(i,2) = 0.0; } @@ -80,13 +92,20 @@ MEAMKokkos::meam_dens_setup(int atom_nmax, int nall, int n_neigh) memoryKK->destroy_kokkos(k_arho3b,arho3b); memoryKK->destroy_kokkos(k_t_ave,t_ave); memoryKK->destroy_kokkos(k_tsq_ave,tsq_ave); + // msmeam + memoryKK->destroy_kokkos(k_arho2mb, arho2mb); + memoryKK->destroy_kokkos(k_arho1m, arho1m); + memoryKK->destroy_kokkos(k_arho2m, arho2m); + memoryKK->destroy_kokkos(k_arho3m, arho3m); + memoryKK->destroy_kokkos(k_arho3mb, arho3mb); nmax = atom_nmax; -// memory->create(rho, nmax, "pair:rho"); + + //memory->create(rho, nmax, "pair:rho"); k_rho = DAT::tdual_ffloat_1d("pair:rho",nmax); d_rho = k_rho.template view(); h_rho = k_rho.h_view; - // memory->create(rho0, nmax, "pair:rho0"); + //memory->create(rho0, nmax, "pair:rho0"); k_rho0 = DAT::tdual_ffloat_1d("pair:rho0",nmax); d_rho0 = k_rho0.template view(); h_rho0 = k_rho0.h_view; @@ -150,6 +169,28 @@ MEAMKokkos::meam_dens_setup(int atom_nmax, int nall, int n_neigh) k_tsq_ave = DAT::tdual_ffloat_2d("pair:tsq_ave",nmax, 3); d_tsq_ave = k_tsq_ave.template view(); h_tsq_ave = k_tsq_ave.h_view; + + // msmeam + //memory->create(arho2mb, nmax, "pair:arho2mb"); + k_arho2mb = DAT::tdual_ffloat_1d("pair:arho2mb",nmax); + d_arho2mb = k_arho2mb.template view(); + h_arho2mb = k_arho2mb.h_view; + //memory->create(arho1m, nmax, 3, "pair:arho1m"); + k_arho1m = DAT::tdual_ffloat_2d("pair:arho1m", nmax, 3); + d_arho1m = k_arho1m.template view(); + h_arho1m = k_arho1m.h_view; + //memory->create(arho2m, nmax, 6, "pair:arho2m"); + k_arho2m = DAT::tdual_ffloat_2d("pair:arho2m", nmax, 6); + d_arho2m = k_arho2m.template view(); + h_arho2m = k_arho2m.h_view; + //memory->create(arho3m, nmax, 10, "pair:arho3m"); + k_arho3m = DAT::tdual_ffloat_2d("pair:arho3m", nmax, 10); + d_arho3m = k_arho3m.template view(); + h_arho3m = k_arho3m.h_view; + //memory->create(arho3mb, nmax, 3, "pair:arho3mb"); + k_arho3mb = 
DAT::tdual_ffloat_2d("pair:arho3mb", nmax, 3); + d_arho3mb = k_arho3mb.template view(); + h_arho3mb = k_arho3mb.h_view; } if (n_neigh > maxneigh) { @@ -206,6 +247,12 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ dup_arho3b = Kokkos::Experimental::create_scatter_view(d_arho3b); dup_t_ave = Kokkos::Experimental::create_scatter_view(d_t_ave); dup_tsq_ave = Kokkos::Experimental::create_scatter_view(d_tsq_ave); + // msmeam + dup_arho2mb = Kokkos::Experimental::create_scatter_view(d_arho2mb); + dup_arho1m = Kokkos::Experimental::create_scatter_view(d_arho1m); + dup_arho2m = Kokkos::Experimental::create_scatter_view(d_arho2m); + dup_arho3m = Kokkos::Experimental::create_scatter_view(d_arho3m); + dup_arho3mb = Kokkos::Experimental::create_scatter_view(d_arho3mb); } else { ndup_rho0 = Kokkos::Experimental::create_scatter_view(d_rho0); ndup_arho2b = Kokkos::Experimental::create_scatter_view(d_arho2b); @@ -215,6 +262,12 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ ndup_arho3b = Kokkos::Experimental::create_scatter_view(d_arho3b); ndup_t_ave = Kokkos::Experimental::create_scatter_view(d_t_ave); ndup_tsq_ave = Kokkos::Experimental::create_scatter_view(d_tsq_ave); + // msmeam + ndup_arho2mb = Kokkos::Experimental::create_scatter_view(d_arho2mb); + ndup_arho1m = Kokkos::Experimental::create_scatter_view(d_arho1m); + ndup_arho2m = Kokkos::Experimental::create_scatter_view(d_arho2m); + ndup_arho3m = Kokkos::Experimental::create_scatter_view(d_arho3m); + ndup_arho3mb = Kokkos::Experimental::create_scatter_view(d_arho3mb); } copymode = 1; @@ -233,6 +286,12 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ Kokkos::Experimental::contribute(d_arho3b, dup_arho3b); Kokkos::Experimental::contribute(d_t_ave, dup_t_ave); Kokkos::Experimental::contribute(d_tsq_ave, dup_tsq_ave); + // msmeam + Kokkos::Experimental::contribute(d_arho2mb, dup_arho2mb); + Kokkos::Experimental::contribute(d_arho1m, dup_arho1m); + 
Kokkos::Experimental::contribute(d_arho2m, dup_arho2m); + Kokkos::Experimental::contribute(d_arho3m, dup_arho3m); + Kokkos::Experimental::contribute(d_arho3mb, dup_arho3mb); // free duplicated memory dup_rho0 = decltype(dup_rho0)(); @@ -243,6 +302,12 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ dup_arho3b = decltype(dup_arho3b)(); dup_t_ave = decltype(dup_t_ave)(); dup_tsq_ave = decltype(dup_tsq_ave)(); + // msmeam + dup_arho2mb = decltype(dup_arho2mb)(); + dup_arho1m = decltype(dup_arho1m)(); + dup_arho2m = decltype(dup_arho2m)(); + dup_arho3m = decltype(dup_arho3m)(); + dup_arho3mb = decltype(dup_arho3mb)(); } } @@ -417,7 +482,6 @@ MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d ty int offset) const { // The rho0, etc. arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial - auto v_rho0 = ScatterViewHelper,decltype(dup_rho0),decltype(ndup_rho0)>::get(dup_rho0,ndup_rho0); auto a_rho0 = v_rho0.template access>(); auto v_arho2b = ScatterViewHelper,decltype(dup_arho2b),decltype(ndup_arho2b)>::get(dup_arho2b,ndup_arho2b); @@ -434,6 +498,17 @@ MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d ty auto a_t_ave = v_t_ave.template access>(); auto v_tsq_ave = ScatterViewHelper,decltype(dup_tsq_ave),decltype(ndup_tsq_ave)>::get(dup_tsq_ave,ndup_tsq_ave); auto a_tsq_ave = v_tsq_ave.template access>(); + // msmeam + auto v_arho2mb = ScatterViewHelper,decltype(dup_arho2mb),decltype(ndup_arho2mb)>::get(dup_arho2mb,ndup_arho2mb); + auto a_arho2mb = v_arho2mb.template access>(); + auto v_arho1m = ScatterViewHelper,decltype(dup_arho1m),decltype(ndup_arho1m)>::get(dup_arho1m,ndup_arho1m); + auto a_arho1m = v_arho1m.template access>(); + auto v_arho2m = ScatterViewHelper,decltype(dup_arho2m),decltype(ndup_arho2m)>::get(dup_arho2m,ndup_arho2m); + auto a_arho2m = v_arho2m.template access>(); + auto v_arho3m = ScatterViewHelper,decltype(dup_arho3m),decltype(ndup_arho3m)>::get(dup_arho3m,ndup_arho3m); + auto 
a_arho3m = v_arho3m.template access>(); + auto v_arho3mb = ScatterViewHelper,decltype(dup_arho3mb),decltype(ndup_arho3mb)>::get(dup_arho3mb,ndup_arho3mb); + auto a_arho3mb = v_arho3mb.template access>(); const int elti = d_map[type[i]]; const double xtmp = x(i,0); @@ -463,6 +538,16 @@ MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d ty double rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-beta1_meam[elti] * ai) * sij; double rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-beta2_meam[elti] * ai) * sij; double rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-beta3_meam[elti] * ai) * sij; + // msmeam + double rhoa1mj, rhoa2mj, rhoa3mj, rhoa1mi, rhoa2mi, rhoa3mi; + if (msmeamflag) { + rhoa1mj = ro0j * t1m_meam[eltj] * MathSpecialKokkos::fm_exp(-beta1m_meam[eltj] * aj) * sij; + rhoa2mj = ro0j * t2m_meam[eltj] * MathSpecialKokkos::fm_exp(-beta2m_meam[eltj] * aj) * sij; + rhoa3mj = ro0j * t3m_meam[eltj] * MathSpecialKokkos::fm_exp(-beta3m_meam[eltj] * aj) * sij; + rhoa1mi = ro0i * t1m_meam[elti] * MathSpecialKokkos::fm_exp(-beta1m_meam[elti] * ai) * sij; + rhoa2mi = ro0i * t2m_meam[elti] * MathSpecialKokkos::fm_exp(-beta2m_meam[elti] * ai) * sij; + rhoa3mi = ro0i * t3m_meam[elti] * MathSpecialKokkos::fm_exp(-beta3m_meam[elti] * ai) * sij; + } if (ialloy == 1) { rhoa1j *= t1_meam[eltj]; rhoa2j *= t2_meam[eltj]; @@ -499,20 +584,45 @@ MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d ty const double A1i = rhoa1i / rij; const double A2i = rhoa2i / rij2; const double A3i = rhoa3i / (rij2 * rij); + double A1mj, A2mj, A3mj, A1mi, A2mi, A3mi; + if (msmeamflag) { + a_arho2mb[i] += rhoa2mj; + a_arho2mb[j] += rhoa2mi; + A1mj = rhoa1mj / rij; + A2mj = rhoa2mj / rij2; + A3mj = rhoa3mj / (rij2 * rij); + A1mi = rhoa1mi / rij; + A2mi = rhoa2mi / rij2; + A3mi = rhoa3mi / (rij2 * rij); + } int nv2 = 0; int nv3 = 0; for (int m = 0; m < 3; m++) { - a_arho1(i,m) += A1j * delij[m]; + a_arho1(i,m) += A1j * delij[m]; a_arho1(j,m) += -A1i * delij[m]; - a_arho3b(i,m) += rhoa3j * 
delij[m] / rij; + a_arho3b(i,m) += rhoa3j * delij[m] / rij; a_arho3b(j,m) += -rhoa3i * delij[m] / rij; + if (msmeamflag) { + a_arho1m(i,m) += A1mj * delij[m]; + a_arho1m(j,m) += -A1mi * delij[m]; + a_arho3mb(i,m) += rhoa3mj * delij[m] / rij; + a_arho3mb(j,m) += -rhoa3mi * delij[m] / rij; + } for (int n = m; n < 3; n++) { a_arho2(i,nv2) += A2j * delij[m] * delij[n]; a_arho2(j,nv2) += A2i * delij[m] * delij[n]; + if (msmeamflag) { + a_arho2m(i,nv2) += A2mj * delij[m] * delij[n]; + a_arho2m(j,nv2) += A2mi * delij[m] * delij[n]; + } nv2++; for (int p = n; p < 3; p++) { - a_arho3(i,nv3) += A3j * delij[m] * delij[n] * delij[p]; + a_arho3(i,nv3) += A3j * delij[m] * delij[n] * delij[p]; a_arho3(j,nv3) += -A3i * delij[m] * delij[n] * delij[p]; + if (msmeamflag) { + a_arho3m(i,nv3) += A3mj * delij[m] * delij[n] * delij[p]; + a_arho3m(j,nv3) += -A3mi * delij[m] * delij[n] * delij[p]; + } nv3++; } } diff --git a/src/KOKKOS/meam_force_kokkos.h b/src/KOKKOS/meam_force_kokkos.h index e7e6c64231..5c4244e99b 100644 --- a/src/KOKKOS/meam_force_kokkos.h +++ b/src/KOKKOS/meam_force_kokkos.h @@ -119,6 +119,17 @@ KOKKOS_INLINE_FUNCTION void MEAMKokkos::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForce::operator()(TagMEAMForcev2D[nv2]; + drho3mdrm1[m] = drho3mdrm1[m] + d_arho3m(i, vind3D[m][n][p]) * arg; + drho3mdrm2[m] = drho3mdrm2[m] + d_arho3m(j, vind3D[m][n][p]) * arg; + nv2 = nv2 + 1; + } + } + drho3mdrm1[m] = (a3 * drho3mdrm1[m] - a3a * d_arho3mb(i, m)) * rhoa3mj; + drho3mdrm2[m] = (-a3 * drho3mdrm2[m] + a3a * d_arho3mb(j, m)) * rhoa3mi; + } + } else { + for (m = 0; m < 3; m++) { + drho1mdrm1[m] = 0.0; + drho1mdrm2[m] = 0.0; + drho2mdrm1[m] = 0.0; + drho2mdrm2[m] = 0.0; + drho3mdrm1[m] = 0.0; + drho3mdrm2[m] = 0.0; + } + } + // Compute derivatives of weighting functions t wrt rij - t1i = d_t_ave(i, 0); - t2i = d_t_ave(i, 1); - t3i = d_t_ave(i, 2); - t1j = d_t_ave(j, 0); - t2j = 
d_t_ave(j, 1); - t3j = d_t_ave(j, 2); - - if (ialloy == 1) { - - a1i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 0)); - a1j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 0)); - a2i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 1)); - a2j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 1)); - a3i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 2)); - a3j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 2)); - - dt1dr1 = a1i * (t1mj - t1i * MathSpecialKokkos::square(t1mj)); - dt1dr2 = a1j * (t1mi - t1j * MathSpecialKokkos::square(t1mi)); - dt2dr1 = a2i * (t2mj - t2i * MathSpecialKokkos::square(t2mj)); - dt2dr2 = a2j * (t2mi - t2j * MathSpecialKokkos::square(t2mi)); - dt3dr1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj)); - dt3dr2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi)); - - } else if (ialloy == 2) { + // Weighting functions t set to unity for msmeam + if (msmeamflag) { + t1i = 1.0; + t2i = 1.0; + t3i = 1.0; + t1j = 1.0; + t2j = 1.0; + t3j = 1.0; dt1dr1 = 0.0; dt1dr2 = 0.0; dt2dr1 = 0.0; dt2dr2 = 0.0; dt3dr1 = 0.0; dt3dr2 = 0.0; - } else { - ai = 0.0; - if (!iszero_kk(d_rho0[i])) ai = drhoa0j * sij / d_rho0[i]; - aj = 0.0; - if (!iszero_kk(d_rho0[j])) aj = drhoa0i * sij / d_rho0[j]; + t1i = d_t_ave(i, 0); + t2i = d_t_ave(i, 1); + t3i = d_t_ave(i, 2); + t1j = d_t_ave(j, 0); + t2j = d_t_ave(j, 1); + t3j = d_t_ave(j, 2); - dt1dr1 = ai * (t1mj - t1i); - dt1dr2 = aj * (t1mi - t1j); - dt2dr1 = ai * (t2mj - t2i); - dt2dr2 = aj * (t2mi - t2j); - dt3dr1 = ai * (t3mj - t3i); - dt3dr2 = aj * (t3mi - t3j); + if (ialloy == 1) { + + a1i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 0)); + a1j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 0)); + a2i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 1)); + a2j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 1)); + a3i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 2)); + a3j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 2)); + + dt1dr1 = a1i * (t1mj - t1i * MathSpecialKokkos::square(t1mj)); + dt1dr2 = a1j * (t1mi - t1j * MathSpecialKokkos::square(t1mi)); + 
dt2dr1 = a2i * (t2mj - t2i * MathSpecialKokkos::square(t2mj)); + dt2dr2 = a2j * (t2mi - t2j * MathSpecialKokkos::square(t2mi)); + dt3dr1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj)); + dt3dr2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi)); + + } else if (ialloy == 2) { + + dt1dr1 = 0.0; + dt1dr2 = 0.0; + dt2dr1 = 0.0; + dt2dr2 = 0.0; + dt3dr1 = 0.0; + dt3dr2 = 0.0; + + } else { + + ai = 0.0; + if (!iszero_kk(d_rho0[i])) ai = drhoa0j * sij / d_rho0[i]; + aj = 0.0; + if (!iszero_kk(d_rho0[j])) aj = drhoa0i * sij / d_rho0[j]; + + dt1dr1 = ai * (t1mj - t1i); + dt1dr2 = aj * (t1mi - t1j); + dt2dr1 = ai * (t2mj - t2i); + dt2dr2 = aj * (t2mi - t2j); + dt3dr1 = ai * (t3mj - t3i); + dt3dr2 = aj * (t3mi - t3j); + } } // Compute derivatives of total density wrt rij, sij and rij(3) get_shpfcn(lattce_meam[elti][elti], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpi); get_shpfcn(lattce_meam[eltj][eltj], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpj); - drhodr1 = d_dgamma1[i] * drho0dr1 + - d_dgamma2[i] * - (dt1dr1 * d_rho1[i] + t1i * drho1dr1 + dt2dr1 * d_rho2[i] + t2i * drho2dr1 + - dt3dr1 * d_rho3[i] + t3i * drho3dr1) - - d_dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); - drhodr2 = d_dgamma1[j] * drho0dr2 + - d_dgamma2[j] * - (dt1dr2 * d_rho1[j] + t1j * drho1dr2 + dt2dr2 * d_rho2[j] + t2j * drho2dr2 + - dt3dr2 * d_rho3[j] + t3j * drho3dr2) - - d_dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); - for (m = 0; m < 3; m++) { - drhodrm1[m] = 0.0; - drhodrm2[m] = 0.0; - drhodrm1[m] = - d_dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); - drhodrm2[m] = - d_dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + + if (msmeamflag) { + drhodr1 = d_dgamma1[i] * drho0dr1 + + d_dgamma2[i] * (dt1dr1 * d_rho1[i] + t1i * (drho1dr1 - drho1mdr1) + + dt2dr1 * d_rho2[i] + t2i * (drho2dr1 - drho2mdr1) + + dt3dr1 * d_rho3[i] + t3i * (drho3dr1 - drho3mdr1)) - + 
d_dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); + drhodr2 = d_dgamma1[j] * drho0dr2 + + d_dgamma2[j] * (dt1dr2 * d_rho1[j] + t1j * (drho1dr2 - drho1mdr2) + + dt2dr2 * d_rho2[j] + t2j * (drho2dr2 - drho2mdr2) + + dt3dr2 * d_rho3[j] + t3j * (drho3dr2 - drho3mdr2)) - + d_dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); + for (m = 0; m < 3; m++) { + drhodrm1[m] = 0.0; + drhodrm2[m] = 0.0; + drhodrm1[m] = d_dgamma2[i] * (t1i * (drho1drm1[m] - drho1mdrm1[m]) + + t2i * (drho2drm1[m] - drho2mdrm1[m]) + + t3i * (drho3drm1[m] - drho3mdrm1[m]) ); + drhodrm2[m] = d_dgamma2[j] * (t1j * (drho1drm2[m] - drho1mdrm2[m]) + + t2j * (drho2drm2[m] - drho2mdrm2[m]) + + t3j * (drho3drm2[m] - drho3mdrm2[m]) ); + } + } else { + drhodr1 = d_dgamma1[i] * drho0dr1 + + d_dgamma2[i] * + (dt1dr1 * d_rho1[i] + t1i * drho1dr1 + dt2dr1 * d_rho2[i] + t2i * drho2dr1 + + dt3dr1 * d_rho3[i] + t3i * drho3dr1) - + d_dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); + drhodr2 = d_dgamma1[j] * drho0dr2 + + d_dgamma2[j] * + (dt1dr2 * d_rho1[j] + t1j * drho1dr2 + dt2dr2 * d_rho2[j] + t2j * drho2dr2 + + dt3dr2 * d_rho3[j] + t3j * drho3dr2) - + d_dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); + for (m = 0; m < 3; m++) { + drhodrm1[m] = 0.0; + drhodrm2[m] = 0.0; + drhodrm1[m] = + d_dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); + drhodrm2[m] = + d_dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + } } // Compute derivatives wrt sij, but only if necessary @@ -416,6 +594,24 @@ KOKKOS_INLINE_FUNCTION void MEAMKokkos::operator()(TagMEAMForce::operator()(TagMEAMForce::~MEAMKokkos() memoryKK->destroy_kokkos(k_scrfcn,scrfcn); memoryKK->destroy_kokkos(k_dscrfcn,dscrfcn); memoryKK->destroy_kokkos(k_fcpair,fcpair); + + // msmeam + + memoryKK->destroy_kokkos(k_arho2mb, arho2mb); + memoryKK->destroy_kokkos(k_arho1m, arho1m); + memoryKK->destroy_kokkos(k_arho2m, arho2m); + 
memoryKK->destroy_kokkos(k_arho3m, arho3m); + memoryKK->destroy_kokkos(k_arho3mb, arho3mb); } #include "meam_setup_done_kokkos.h" diff --git a/src/KOKKOS/meam_kokkos.h b/src/KOKKOS/meam_kokkos.h index cc75023810..2203355641 100644 --- a/src/KOKKOS/meam_kokkos.h +++ b/src/KOKKOS/meam_kokkos.h @@ -136,6 +136,13 @@ template class MEAMKokkos : public MEAM { DAT::tdual_ffloat_1d k_scrfcn, k_dscrfcn, k_fcpair; typename ArrayTypes::t_ffloat_1d d_scrfcn, d_dscrfcn, d_fcpair; HAT::t_ffloat_1d h_scrfcn, h_dscrfcn, h_fcpair; + // msmeam + DAT::tdual_ffloat_2d k_arho1m, k_arho2m, k_arho3m, k_arho3mb; + typename ArrayTypes::t_ffloat_2d d_arho1m, d_arho2m, d_arho3m, d_arho3mb; + HAT::t_ffloat_2d h_arho1m, h_arho2m, h_arho3m, h_arho3mb; + DAT::tdual_ffloat_1d k_arho2mb; + typename ArrayTypes::t_ffloat_1d d_arho2mb; + HAT::t_ffloat_1d h_arho2mb; protected: int need_dup; @@ -195,6 +202,31 @@ template class MEAMKokkos : public MEAM { dup_vatom; NonDupScatterView ndup_vatom; + + // msmeam + + DupScatterView + dup_arho1m; + NonDupScatterView + ndup_arho1m; + DupScatterView + dup_arho2m; + NonDupScatterView + ndup_arho2m; + DupScatterView + dup_arho3m; + NonDupScatterView + ndup_arho3m; + DupScatterView + dup_arho2mb; + NonDupScatterView + ndup_arho2mb; + DupScatterView + dup_arho3mb; + NonDupScatterView + ndup_arho3mb; }; KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/min_kokkos.cpp b/src/KOKKOS/min_kokkos.cpp index 4e9c9db4e2..4e1c3967ff 100644 --- a/src/KOKKOS/min_kokkos.cpp +++ b/src/KOKKOS/min_kokkos.cpp @@ -513,6 +513,7 @@ double MinKokkos::energy_force(int resetflag) if (modify->n_min_post_force) { timer->stamp(); modify->min_post_force(vflag); + atomKK->sync(Device,F_MASK); timer->stamp(Timer::MODIFY); } diff --git a/src/KOKKOS/mliap_data_kokkos.cpp b/src/KOKKOS/mliap_data_kokkos.cpp index f9453301c7..993272771d 100644 --- a/src/KOKKOS/mliap_data_kokkos.cpp +++ b/src/KOKKOS/mliap_data_kokkos.cpp @@ -56,7 +56,9 @@ MLIAPDataKokkos::~MLIAPDataKokkos() { 
memoryKK->destroy_kokkos(k_ielems,ielems); memoryKK->destroy_kokkos(k_numneighs,numneighs); memoryKK->destroy_kokkos(k_jatoms,jatoms); + memoryKK->destroy_kokkos(k_pair_i,pair_i); memoryKK->destroy_kokkos(k_jelems,jelems); + memoryKK->destroy_kokkos(k_elems,elems); memoryKK->destroy_kokkos(k_ij); memoryKK->destroy_kokkos(k_rij,rij); memoryKK->destroy_kokkos(k_graddesc,graddesc); @@ -75,13 +77,17 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i nmax = atom->nmax; memoryKK->destroy_kokkos(k_gradforce,gradforce); memoryKK->create_kokkos(k_gradforce, gradforce, nmax, size_gradforce, "mliap_data:gradforce"); - } + memoryKK->destroy_kokkos(k_elems,elems); + memoryKK->create_kokkos(k_elems, elems, nmax, "mliap_data:elems"); } // clear gradforce array + int nall = atom->nlocal + atom->nghost; + ntotal = nall; auto d_gradforce = k_gradforce.template view(); Kokkos::deep_copy(d_gradforce, 0.); - + auto d_elems = k_elems.template view(); + Kokkos::deep_copy(d_elems, 0.); // grow arrays if necessary nlistatoms = list->inum; @@ -122,6 +128,7 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i auto d_ij = k_ij.template view(); auto d_numneighs = k_numneighs.template view(); auto d_jatoms = k_jatoms.template view(); + auto d_pair_i= k_pair_i.template view(); auto d_jelems= k_jelems.template view(); auto d_rij= k_rij.template view(); @@ -162,6 +169,7 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i const int jelem = map(jtype); if (rsq < d_cutsq(itype,jtype)) { d_jatoms(ij) = j; + d_pair_i(ij) = i; d_jelems(ij) = jelem; d_rij(ij, 0) = delx; d_rij(ij, 1) = dely; @@ -172,8 +180,11 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i d_iatoms[ii] = i; d_ielems[ii] = ielem; }); - - modified(execution_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | JATOMS_MASK | JELEMS_MASK | RIJ_MASK | IJ_MASK ); + Kokkos::parallel_for(nmax, KOKKOS_LAMBDA (int i) { + const int itype = type(i); + d_elems(i) = 
map(itype); + }); + modified(execution_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | ELEMS_MASK | JATOMS_MASK | PAIR_I_MASK | JELEMS_MASK | RIJ_MASK | IJ_MASK ); eflag = eflag_in; vflag = vflag_in; } @@ -183,7 +194,8 @@ void MLIAPDataKokkos::generate_neighdata(class NeighList *list_in, i template void MLIAPDataKokkos::grow_neigharrays() { AtomKokkos *atomKK = (AtomKokkos *) atom; - + f = atom->f; + f_device = atomKK->k_f.view().data(); // grow neighbor arrays if necessary if (natomneigh_max < nlistatoms) { @@ -207,6 +219,7 @@ void MLIAPDataKokkos::grow_neigharrays() { auto x = atomKK->k_x.view(); auto type = atomKK->k_type.view(); auto d_cutsq=k_pairmliap->k_cutsq.template view(); + auto h_cutsq=k_pairmliap->k_cutsq.template view(); auto d_numneighs = k_numneighs.template view(); Kokkos::parallel_reduce(nlistatoms, KOKKOS_LAMBDA (int ii, int &contrib) { const int i = d_ilist[ii]; @@ -229,22 +242,24 @@ void MLIAPDataKokkos::grow_neigharrays() { } d_numneighs(ii) = count; contrib += count; - }, nij_total); + }, npairs); modified(execution_space, NUMNEIGHS_MASK); - if (nneigh_max < nij_total) { + if (nneigh_max < npairs) { memoryKK->destroy_kokkos(k_jatoms,jatoms); - memoryKK->create_kokkos(k_jatoms, jatoms, nij_total, "mliap_data:jatoms"); + memoryKK->create_kokkos(k_jatoms, jatoms, npairs, "mliap_data:jatoms"); + memoryKK->destroy_kokkos(k_pair_i,pair_i); + memoryKK->create_kokkos(k_pair_i, pair_i, npairs, "mliap_data:pair_i"); memoryKK->destroy_kokkos(k_jelems,jelems); - memoryKK->create_kokkos(k_jelems, jelems, nij_total, "mliap_data:jelems"); + memoryKK->create_kokkos(k_jelems, jelems, npairs, "mliap_data:jelems"); memoryKK->destroy_kokkos(k_rij,rij); - memoryKK->create_kokkos(k_rij, rij, nij_total, 3, "mliap_data:rij"); + memoryKK->create_kokkos(k_rij, rij, npairs, 3, "mliap_data:rij"); if (gradgradflag == 0){ memoryKK->destroy_kokkos(k_graddesc,graddesc); - memoryKK->create_kokkos(k_graddesc, graddesc, nij_total, ndescriptors,3, "mliap_data:graddesc"); + 
memoryKK->create_kokkos(k_graddesc, graddesc, npairs, ndescriptors,3, "mliap_data:graddesc"); } - nneigh_max = nij_total; + nneigh_max = npairs; } } @@ -256,7 +271,9 @@ void MLIAPDataKokkos::modified(ExecutionSpace space, unsigned int ma if (mask & IATOMS_MASK ) k_iatoms .modify(); if (mask & IELEMS_MASK ) k_ielems .modify(); if (mask & JATOMS_MASK ) k_jatoms .modify(); + if (mask & PAIR_I_MASK ) k_pair_i .modify(); if (mask & JELEMS_MASK ) k_jelems .modify(); + if (mask & ELEMS_MASK ) k_elems .modify(); if (mask & IJ_MASK ) k_ij .modify(); if (mask & BETAS_MASK ) k_betas .modify(); if (mask & DESCRIPTORS_MASK ) k_descriptors .modify(); @@ -274,7 +291,9 @@ void MLIAPDataKokkos::modified(ExecutionSpace space, unsigned int ma if (mask & IATOMS_MASK ) k_iatoms .modify(); if (mask & IELEMS_MASK ) k_ielems .modify(); if (mask & JATOMS_MASK ) k_jatoms .modify(); + if (mask & PAIR_I_MASK ) k_pair_i .modify(); if (mask & JELEMS_MASK ) k_jelems .modify(); + if (mask & ELEMS_MASK ) k_elems .modify(); if (mask & IJ_MASK ) k_ij .modify(); if (mask & BETAS_MASK ) k_betas .modify(); if (mask & DESCRIPTORS_MASK ) k_descriptors .modify(); @@ -300,7 +319,9 @@ void MLIAPDataKokkos::sync(ExecutionSpace space, unsigned int mask, if (mask & IATOMS_MASK ) k_iatoms .sync(); if (mask & IELEMS_MASK ) k_ielems .sync(); if (mask & JATOMS_MASK ) k_jatoms .sync(); + if (mask & PAIR_I_MASK ) k_pair_i .sync(); if (mask & JELEMS_MASK ) k_jelems .sync(); + if (mask & ELEMS_MASK ) k_elems .sync(); if (mask & IJ_MASK ) k_ij .sync(); if (mask & BETAS_MASK ) k_betas .sync(); if (mask & DESCRIPTORS_MASK ) k_descriptors .sync(); @@ -317,7 +338,9 @@ void MLIAPDataKokkos::sync(ExecutionSpace space, unsigned int mask, if (mask & IATOMS_MASK ) k_iatoms .sync(); if (mask & IELEMS_MASK ) k_ielems .sync(); if (mask & JATOMS_MASK ) k_jatoms .sync(); + if (mask & PAIR_I_MASK ) k_pair_i .sync(); if (mask & JELEMS_MASK ) k_jelems .sync(); + if (mask & ELEMS_MASK ) k_elems .sync(); if (mask & IJ_MASK ) k_ij 
.sync(); if (mask & BETAS_MASK ) k_betas .sync(); if (mask & DESCRIPTORS_MASK ) k_descriptors .sync(); diff --git a/src/KOKKOS/mliap_data_kokkos.h b/src/KOKKOS/mliap_data_kokkos.h index ba81e2a226..f641085c6a 100644 --- a/src/KOKKOS/mliap_data_kokkos.h +++ b/src/KOKKOS/mliap_data_kokkos.h @@ -43,6 +43,8 @@ enum { GAMMA_MASK_MASK = 0x00001000, GAMMA_ROW_MASK = 0x00002000, GAMMA_COL_MASK = 0x00004000, + PAIR_I_MASK = 0x00008000, + ELEMS_MASK = 0x00010000, }; // clang-format on @@ -65,6 +67,8 @@ template class MLIAPDataKokkos : public MLIAPData { DAT::tdual_int_1d k_iatoms; // index of each atom DAT::tdual_int_1d k_ielems; // element of each atom DAT::tdual_int_1d k_jatoms; // index of each neighbor + DAT::tdual_int_1d k_elems; // element of each atom in or not in the neighborlist + DAT::tdual_int_1d k_pair_i; // index of each i atom for each ij pair DAT::tdual_int_1d k_jelems; // element of each neighbor DAT::tdual_int_1d k_ij; // Start location for each particle DAT::tdual_float_2d k_betas; // betas for all atoms in list @@ -78,10 +82,123 @@ template class MLIAPDataKokkos : public MLIAPData { DAT::tdual_int_2d k_gamma_row_index; // row (parameter) index DAT::tdual_int_2d k_gamma_col_index; // column (descriptor) index - int nij_total; + // Just cached for python interface + double *f_device; protected: class LAMMPS *lmp; }; + +// Now we need a specific device version for communication with python +class MLIAPDataKokkosDevice { +public: + + MLIAPDataKokkosDevice(MLIAPDataKokkos &base) : + size_array_rows(base.size_array_rows), + size_array_cols(base.size_array_cols), + natoms(base.natoms), + yoffset(base.yoffset), + zoffset(base.zoffset), + ndims_force(base.ndims_force), + ndims_virial(base.ndims_virial), + size_gradforce(base.size_gradforce), + f(base.f_device), + gradforce(base.k_gradforce.d_view.data()), + betas(base.k_betas.d_view.data()), + descriptors(base.k_descriptors.d_view.data()), + eatoms(base.k_eatoms.d_view.data()), + energy(&base.energy), + 
ndescriptors(base.ndescriptors), + nparams(base.nparams), + nelements(base.nelements), + gamma_nnz(base.gamma_nnz), + gamma(base.k_gamma.d_view.data()), + gamma_row_index(base.k_gamma_row_index.d_view.data()), + gamma_col_index(base.k_gamma_col_index.d_view.data()), + egradient(nullptr), + ntotal(base.ntotal), + nlistatoms(base.nlistatoms), + natomneigh(base.natomneigh), + numneighs(base.numneighs), + iatoms(base.k_iatoms.d_view.data()), + pair_i(base.k_pair_i.d_view.data()), + ielems(base.k_ielems.d_view.data()), + nneigh_max(base.nneigh_max), + npairs(base.npairs), + jatoms(base.k_jatoms.d_view.data()), + jelems(base.k_jelems.d_view.data()), + elems(base.k_elems.d_view.data()), + rij(base.k_rij.d_view.data()), + graddesc(base.k_graddesc.d_view.data()), + eflag(base.eflag), + vflag(base.vflag), + pairmliap(dynamic_cast *>(base.pairmliap)), +#if defined(KOKKOS_ENABLE_CUDA) + dev(1) +#else + dev(0) +#endif + { } + int size_array_rows; + int size_array_cols; + int natoms; + int yoffset; + int zoffset; + int ndims_force; + int ndims_virial; + int size_gradforce; + + //Write only + double *f; + double *gradforce; + double *betas; + double *descriptors; + double *eatoms; + double *energy; + + // sizing + const int ndescriptors; + const int nparams; + const int nelements; + + //Ignored for now + int gamma_nnz; + double *gamma; + int *gamma_row_index; + int *gamma_col_index; + double *egradient; + + // Neighborlist stuff + const int ntotal; + const int nlistatoms; + const int natomneigh; + int *numneighs; + int *iatoms; + int *pair_i; + int *ielems; + const int nneigh_max; + const int npairs; + int *jatoms; + int *jelems; + int *elems; + double *rij; + double *graddesc; + int eflag; + int vflag; + + class PairMLIAPKokkos *pairmliap; // access to pair tally functions + + int dev; + +#ifdef LMP_KOKKOS_GPU + MLIAPDataKokkosDevice(MLIAPDataKokkos &base) : ndescriptors(-1),nparams(-1),nelements(-1),ntotal(-1),nlistatoms(-1),natomneigh(-1), + nneigh_max(-1),npairs(-1) + { + // 
It cannot get here, but needed for compilation + } +#endif +}; + + } // namespace LAMMPS_NS #endif diff --git a/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp b/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp index f0122bca11..6518cccaa8 100644 --- a/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp +++ b/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp @@ -58,7 +58,7 @@ void MLIAPDescriptorSO3Kokkos::compute_descriptors(class MLIAPData * { auto data = static_cast*>(data_); so3ptr_kokkos->spectrum(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij, - nmax, lmax, rcutfac, alpha, data->nij_total, data->ndescriptors); + nmax, lmax, rcutfac, alpha, data->npairs, data->ndescriptors); Kokkos::deep_copy(data->k_descriptors.template view(), so3ptr_kokkos->m_plist_r); Kokkos::deep_copy(data->k_descriptors.h_view, so3ptr_kokkos->m_plist_r); @@ -70,7 +70,7 @@ template void MLIAPDescriptorSO3Kokkos::compute_forces(class MLIAPData *data_) { auto data = static_cast*>(data_); - int npairs = data->nij_total; + int npairs = data->npairs; auto d_numneighs = data->k_numneighs.template view(); so3ptr_kokkos->spectrum_dxdr(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij, nmax, lmax, rcutfac, alpha, npairs, data->ndescriptors); @@ -186,7 +186,7 @@ void MLIAPDescriptorSO3Kokkos::compute_force_gradients(class MLIAPDa error->all(FLERR,"This has not been tested in cuda/kokkos"); auto data = static_cast*>(data_); - int npairs = data->nij_total; + int npairs = data->npairs; so3ptr_kokkos->spectrum_dxdr(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij, nmax, lmax, rcutfac, alpha, npairs, data->ndescriptors); auto d_dplist_r = so3ptr_kokkos->k_dplist_r; @@ -239,7 +239,7 @@ template void MLIAPDescriptorSO3Kokkos::compute_descriptor_gradients(class MLIAPData *data_) { auto data = static_cast*>(data_); - bigint npairs = data->nij_total; + bigint npairs = data->npairs; 
so3ptr_kokkos->spectrum_dxdr(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij, nmax, lmax, rcutfac, alpha, npairs, data->ndescriptors); auto graddesc = data->k_graddesc.template view(); diff --git a/src/KOKKOS/mliap_model_python_kokkos.h b/src/KOKKOS/mliap_model_python_kokkos.h index e8c9909b88..a223cafd9d 100644 --- a/src/KOKKOS/mliap_model_python_kokkos.h +++ b/src/KOKKOS/mliap_model_python_kokkos.h @@ -36,51 +36,11 @@ class MLIAPModelPythonKokkos : public MLIAPModelPython, public MLIAPModelKokkos< void compute_force_gradients(class MLIAPData *) override; void connect_param_counts(); }; -} // namespace LAMMPS_NS - - - -#include "mliap_data_kokkos.h" - -namespace LAMMPS_NS { class MLIAPModelPythonKokkosDevice: public MLIAPModelPythonKokkos { }; -class MLIAPDataKokkosDevice { -public: +} // namespace LAMMPS_NS - MLIAPDataKokkosDevice(MLIAPDataKokkos &base) : - ndescriptors(base.ndescriptors), - nlistatoms(base.nlistatoms), - ielems(base.k_ielems.d_view.data()), - descriptors(base.k_descriptors.d_view.data()), - betas(base.k_betas.d_view.data()), - eatoms(base.k_eatoms.d_view.data()), - energy(&base.energy), -#if defined(KOKKOS_ENABLE_CUDA) - dev(1) -#else - dev(0) -#endif - { } - - const int ndescriptors; - const int nlistatoms; - int *ielems; - double *descriptors; - double *betas; - double *eatoms; - double *energy; - int dev; - -#ifdef LMP_KOKKOS_GPU - MLIAPDataKokkosDevice(MLIAPDataKokkos &base) : ndescriptors(-1),nlistatoms(-1) - { - // It cannot get here, but needed for compilation - } -#endif -}; -} #endif diff --git a/src/KOKKOS/mliap_unified_couple_kokkos.pyx b/src/KOKKOS/mliap_unified_couple_kokkos.pyx new file mode 100644 index 0000000000..37326263d3 --- /dev/null +++ b/src/KOKKOS/mliap_unified_couple_kokkos.pyx @@ -0,0 +1,445 @@ +# cython: language_level=3 +# distutils: language = c++ + +import pickle +import numpy as np +import lammps.mliap +try: + import cupy +except ImportError: + pass +from libc.stdint 
cimport uintptr_t + +cimport cython +from cpython.ref cimport PyObject +from libc.stdlib cimport malloc, free + + +cdef extern from "lammps.h" namespace "LAMMPS_NS": + cdef cppclass LAMMPS: + pass + + +cdef extern from "mliap_data_kokkos.h" namespace "LAMMPS_NS": + cdef cppclass MLIAPDataKokkosDevice: + # ----- may not need ----- + int size_array_rows + int size_array_cols + int natoms + int yoffset + int zoffset + int ndims_force + int ndims_virial + # -END- may not need -END- + int size_gradforce + # ----- write only ----- + double * f + double * gradforce + double * betas # betas for all atoms in list + double * descriptors # descriptors for all atoms in list + double * eatoms # energies for all atoms in list + double * energy + # -END- write only -END- + int ndescriptors # number of descriptors + int nparams # number of model parameters per element + int nelements # number of elements + + # data structures for grad-grad list (gamma) + + # ----- ignore for now ----- + int gamma_nnz # number of non-zero entries in gamma + double * gamma # gamma element + int * gamma_row_index # row (parameter) index + int * gamma_col_index # column (descriptor) index + double * egradient # energy gradient w.r.t. 
parameters + # -END- ignore for now -END- + + # data structures for mliap neighbor list + # only neighbors strictly inside descriptor cutoff + + int ntotal # total number of owned and ghost atoms on this proc + int nlistatoms # current number of atoms in local atom lists + int natomneigh # current number of atoms and ghosts in atom neighbor arrays + int * numneighs # neighbors count for each atom + int * iatoms # index of each atom + int * pair_i # index of each i atom for each ij pair + int * ielems # element of each atom + int nneigh_max # number of ij neighbors allocated + int npairs # number of ij neighbor pairs + int * jatoms # index of each neighbor + int * jelems # element of each neighbor + int * elems # element of each atom in or not in the neighborlist + double * rij # distance vector of each neighbor + # ----- write only ----- + double * graddesc # descriptor gradient w.r.t. each neighbor + # -END- write only -END- + int eflag # indicates if energy is needed + int vflag # indicates if virial is needed + void * pairmliap # pointer to base class + int dev + +cdef extern from "mliap_unified_kokkos.h" namespace "LAMMPS_NS": + cdef cppclass MLIAPDummyDescriptor: + MLIAPDummyDescriptor(PyObject *, LAMMPS *) except + + int ndescriptors # number of descriptors + int nelements # # of unique elements + char *elements # names of unique elements + double cutmax # maximum cutoff needed + double rcutfac + double *radelem # element radii + + void compute_descriptors(MLIAPDataKokkosDevice *) + void compute_forces(MLIAPDataKokkosDevice *) + void set_elements(char **, int) + + cdef cppclass MLIAPDummyModel: + MLIAPDummyModel(PyObject *, LAMMPS *, char * = NULL) except + + int ndescriptors # number of descriptors + int nparams # number of parameters per element + int nelements; # # of unique elements + + void compute_gradients(MLIAPDataKokkosDevice *) + + cdef void update_pair_energy(MLIAPDataKokkosDevice *, double *) except + + cdef void 
update_pair_forces(MLIAPDataKokkosDevice *, double *) except + + + +LOADED_MODEL = None + + +# @property sans getter +def write_only_property(fset): + return property(fget=None, fset=fset) + +cdef create_array(device, void *pointer, shape,is_int): + size=1 + for i in shape: + size = size*i + + if ( device == 1): + mem = cupy.cuda.UnownedMemory(ptr=int( pointer), owner=None, size=size) + memptr = cupy.cuda.MemoryPointer(mem, 0) + type=cupy.double + if (is_int): + type=cupy.int32 + return cupy.ndarray(shape, type, memptr=memptr) + else: + if (len(shape) == 1 ): + if (is_int): + return np.asarray(pointer) + else: + return np.asarray(pointer) + else: + if (is_int): + return np.asarray(pointer) + else: + return np.asarray(pointer) + + + +# Cython implementation of MLIAPData +# Automatically converts between C arrays and numpy when needed +cdef class MLIAPDataPy: + cdef MLIAPDataKokkosDevice * data + + def __cinit__(self): + self.data = NULL + + def update_pair_energy_cpu(self, eij): + cdef double[:] eij_arr = eij + update_pair_energy(self.data, &eij_arr[0]) + def update_pair_energy_gpu(self, eij): + cdef uintptr_t ptr = eij.data.ptr + update_pair_energy(self.data, ptr) + def update_pair_energy(self, eij): + if self.data.dev==0: + self.update_pair_energy_cpu(eij) + else: + self.update_pair_energy_gpu(eij) + + def update_pair_forces_cpu(self, fij): + cdef double[:, ::1] fij_arr = fij + update_pair_forces(self.data, &fij_arr[0][0]) + def update_pair_forces_gpu(self, fij): + cdef uintptr_t ptr = fij.data.ptr + update_pair_forces(self.data, ptr) + def update_pair_forces(self, fij): + if self.data.dev==0: + self.update_pair_forces_cpu(fij) + else: + self.update_pair_forces_gpu(fij) + @property + def f(self): + if self.data.f is NULL: + return None + return cupy.asarray( self.data.f) + + @property + def size_gradforce(self): + return self.data.size_gradforce + + @write_only_property + def gradforce(self, value): + if self.data.gradforce is NULL: + raise ValueError("attempt to 
set NULL gradforce") + cdef double[:, :] gradforce_view = &self.data.gradforce[0] + cdef double[:, :] value_view = value + gradforce_view[:] = value_view + print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize gradforce") + + @write_only_property + def betas(self, value): + if self.data.betas is NULL: + raise ValueError("attempt to set NULL betas") + cdef double[:, :] betas_view = &self.data.betas[0] + cdef double[:, :] value_view = value + betas_view[:] = value_view + print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize ") + + @write_only_property + def descriptors(self, value): + if self.data.descriptors is NULL: + raise ValueError("attempt to set NULL descriptors") + cdef double[:, :] descriptors_view = &self.data.descriptors[0] + cdef double[:, :] value_view = value + descriptors_view[:] = value_view + print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize descriptors") + + @write_only_property + def eatoms(self, value): + if self.data.eatoms is NULL: + raise ValueError("attempt to set NULL eatoms") + cdef double[:] eatoms_view = &self.data.eatoms[0] + cdef double[:] value_view = value + eatoms_view[:] = value_view + print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize eatoms") + + + @write_only_property + def energy(self, value): + self.data.energy[0] = value + + @property + def ndescriptors(self): + return self.data.ndescriptors + + @property + def nparams(self): + return self.data.nparams + + @property + def nelements(self): + return self.data.nelements + + # data structures for grad-grad list (gamma) + + @property + def gamma_nnz(self): + return self.data.gamma_nnz + + @property + def gamma(self): + if self.data.gamma is NULL: + return None + return create_array(self.data.dev, self.data.gamma, [self.nlistatoms, self.gama_nnz],False) + + @property + def 
gamma_row_index(self): + if self.data.gamma_row_index is NULL: + return None + return create_array(self.data.dev, self.data.gamma_row_index, [self.nlistatoms, self.gamma_nnz],True) + + @property + def gamma_col_index(self): + if self.data.gamma_col_index is NULL: + return None + return create_array(self.data.dev, self.data.gamma_col_index, [self.nlistatoms, self.gamma_nnz],True) + + @property + def egradient(self): + if self.data.egradient is NULL: + return None + return create_array(self.data.dev, self.data.egradient, [self.nelements*self.nparams],False) + + # data structures for mliap neighbor list + # only neighbors strictly inside descriptor cutoff + + @property + def ntotal(self): + return self.data.ntotal + + @property + def elems(self): + if self.data.elems is NULL: + return None + return create_array(self.data.dev, self.data.elems, [self.ntotal],True) + + @property + def nlistatoms(self): + return self.data.nlistatoms + + @property + def natomneigh(self): + return self.data.natomneigh + + @property + def numneighs(self): + if self.data.numneighs is NULL: + return None + return create_array(self.data.dev, self.data.numneighs, [self.natomneigh],False) + + @property + def iatoms(self): + if self.data.iatoms is NULL: + return None + return create_array(self.data.dev, self.data.iatoms, [self.natomneigh],True) + + @property + def ielems(self): + if self.data.ielems is NULL: + return None + return create_array(self.data.dev, self.data.ielems, [self.natomneigh],True) + + @property + def npairs(self): + return self.data.npairs + + @property + def pair_i(self): + if self.data.pair_i is NULL: + return None + return create_array(self.data.dev, self.data.pair_i, [self.npairs],True) + + @property + def pair_j(self): + return self.jatoms + + @property + def jatoms(self): + if self.data.jatoms is NULL: + return None + return create_array(self.data.dev, self.data.jatoms, [self.npairs],True) + + @property + def jelems(self): + if self.data.jelems is NULL: + return None + 
return create_array(self.data.dev, self.data.jelems, [self.npairs],True) + + + @property + def rij(self): + if self.data.rij is NULL: + return None + return create_array(self.data.dev, self.data.rij, [self.npairs,3],False) + + @write_only_property + def graddesc(self, value): + if self.data.graddesc is NULL: + raise ValueError("attempt to set NULL graddesc") + cdef double[:, :, :] graddesc_view = &self.data.graddesc[0] + cdef double[:, :, :] value_view = value + graddesc_view[:] = value_view + + @property + def eflag(self): + return self.data.eflag + + @property + def vflag(self): + return self.data.vflag + + +# Interface between C and Python compute functions +cdef class MLIAPUnifiedInterface: + cdef MLIAPDummyModel * model + cdef MLIAPDummyDescriptor * descriptor + cdef unified_impl + + def __init__(self, unified_impl): + self.model = NULL + self.descriptor = NULL + self.unified_impl = unified_impl + + def compute_gradients(self, data): + self.unified_impl.compute_gradients(data) + + def compute_descriptors(self, data): + self.unified_impl.compute_descriptors(data) + + def compute_forces(self, data): + self.unified_impl.compute_forces(data) + + +cdef public void compute_gradients_python_kokkos(unified_int, MLIAPDataKokkosDevice *data) except * with gil: + pydata = MLIAPDataPy() + pydata.data = data + unified_int.compute_gradients(pydata) + + +cdef public void compute_descriptors_python_kokkos(unified_int, MLIAPDataKokkosDevice *data) except * with gil: + pydata = MLIAPDataPy() + pydata.data = data + unified_int.compute_descriptors(pydata) + + +cdef public void compute_forces_python_kokkos(unified_int, MLIAPDataKokkosDevice *data) except * with gil: + pydata = MLIAPDataPy() + pydata.data = data + unified_int.compute_forces(pydata) + + +# Create a MLIAPUnifiedInterface and connect it to the dummy model, descriptor +cdef public object mliap_unified_connect_kokkos(char *fname, MLIAPDummyModel * model, + MLIAPDummyDescriptor * descriptor) with gil: + str_fname = 
fname.decode('utf-8') + if str_fname == 'EXISTS': + if LOADED_MODEL is None: + raise ValueError("No unified model loaded") + unified = LOADED_MODEL + elif str_fname.endswith(".pt") or str_fname.endswith('.pth'): + import torch + unified = torch.load(str_fname) + else: + with open(str_fname, 'rb') as pfile: + unified = pickle.load(pfile) + + unified_int = MLIAPUnifiedInterface(unified) + unified_int.model = model + unified_int.descriptor = descriptor + + unified.interface = unified_int + + if unified.ndescriptors is None: + raise ValueError("no descriptors set") + + unified_int.descriptor.ndescriptors = unified.ndescriptors + unified_int.descriptor.rcutfac = unified.rcutfac + unified_int.model.ndescriptors = unified.ndescriptors + unified_int.model.nparams = unified.nparams + + if unified.element_types is None: + raise ValueError("no element type set") + + cdef int nelements = len(unified.element_types) + cdef char **elements = malloc(nelements * sizeof(char*)) + + if not elements: + raise MemoryError("failed to allocate memory for element names") + + cdef char *elem_name + for i, elem in enumerate(unified.element_types): + elem_name_bytes = elem.encode('UTF-8') + elem_name = elem_name_bytes + elements[i] = &elem_name[0] + unified_int.descriptor.set_elements(elements, nelements) + unified_int.model.nelements = nelements + + free(elements) + return unified_int + + +# For pre-loading a Python model +def load_from_python(unified): + global LOADED_MODEL + LOADED_MODEL = unified diff --git a/src/KOKKOS/mliap_unified_kokkos.cpp b/src/KOKKOS/mliap_unified_kokkos.cpp new file mode 100644 index 0000000000..bfb9193df6 --- /dev/null +++ b/src/KOKKOS/mliap_unified_kokkos.cpp @@ -0,0 +1,388 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Matt Bettencourt (NVIDIA) +------------------------------------------------------------------------- */ + +#ifdef MLIAP_PYTHON + +#include "mliap_unified_kokkos.h" +#include + +#include "error.h" +#include "lmppython.h" +#include "memory.h" +#include "mliap_data.h" +#include "mliap_unified_couple_kokkos.h" +#include "pair_mliap.h" +#include "python_compat.h" +#include "utils.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +template +MLIAPDummyDescriptorKokkos::MLIAPDummyDescriptorKokkos(LAMMPS *_lmp) : + Pointers(_lmp), MLIAPDummyDescriptor(_lmp), MLIAPDescriptorKokkos(lmp, this) {} + +template +MLIAPDummyDescriptorKokkos::~MLIAPDummyDescriptorKokkos() +{ + // done in base class + // Py_DECREF(unified_interface); +} + +/* ---------------------------------------------------------------------- + invoke compute_descriptors from Cython interface + ---------------------------------------------------------------------- */ + +template +void MLIAPDummyDescriptorKokkos::compute_descriptors(class MLIAPData *data) +{ + PyGILState_STATE gstate = PyGILState_Ensure(); + auto *kokkos_data = dynamic_cast*>(data); + MLIAPDataKokkosDevice raw_data(*kokkos_data); + compute_descriptors_python_kokkos(unified_interface, &raw_data); + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Running mliappy unified compute_descriptors failure."); + } + PyGILState_Release(gstate); +} + +/* 
---------------------------------------------------------------------- + invoke compute_forces from Cython interface + ---------------------------------------------------------------------- */ + +template +void MLIAPDummyDescriptorKokkos::compute_forces(class MLIAPData *data) +{ + PyGILState_STATE gstate = PyGILState_Ensure(); + auto *kokkos_data = dynamic_cast*>(data); + MLIAPDataKokkosDevice raw_data(*kokkos_data); + compute_forces_python_kokkos(unified_interface, &raw_data); + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Running mliappy unified compute_forces failure."); + } + PyGILState_Release(gstate); +} + +// not implemented +template +void MLIAPDummyDescriptorKokkos::compute_force_gradients(class MLIAPData *) +{ + error->all(FLERR, "compute_force_gradients not implemented"); +} + +// not implemented +template +void MLIAPDummyDescriptorKokkos::compute_descriptor_gradients(class MLIAPData *) +{ + error->all(FLERR, "compute_descriptor_gradients not implemented"); +} + +template +void MLIAPDummyDescriptorKokkos::init() +{ + memory->create(radelem, nelements, "mliap_dummy_descriptor:radelem"); + for (int ielem = 0; ielem < nelements; ielem++) { radelem[ielem] = 1; } + + double cut; + cutmax = 0.0; + memory->create(cutsq, nelements, nelements, "mliap/descriptor/dummy:cutsq"); + memory->create(cutghost, nelements, nelements, "mliap/descriptor/dummy:cutghost"); + for (int ielem = 0; ielem < nelements; ielem++) { + // rcutfac set from python, is global cutoff for all elements + cut = 2.0 * radelem[ielem] * rcutfac; + if (cut > cutmax) cutmax = cut; + cutsq[ielem][ielem] = cut * cut; + cutghost[ielem][ielem] = cut * cut; + for (int jelem = ielem + 1; jelem < nelements; jelem++) { + cut = (radelem[ielem] + radelem[jelem]) * rcutfac; + cutsq[ielem][jelem] = cutsq[jelem][ielem] = cut * cut; + cutghost[ielem][jelem] = cutghost[jelem][ielem] = cut * cut; + } + } +} + +template +void 
MLIAPDummyDescriptorKokkos::set_elements(char **elems, int nelems) +{ + nelements = nelems; + elements = new char *[nelems]; + for (int i = 0; i < nelems; i++) { elements[i] = utils::strdup(elems[i]); } +} + +/* ---------------------------------------------------------------------- */ + +template +MLIAPDummyModelKokkos::MLIAPDummyModelKokkos(LAMMPS *lmp, char *coefffilename) : +MLIAPDummyModel(lmp,coefffilename), +MLIAPModelKokkos(lmp, this) +{ + nonlinearflag = 1; +} + +template +MLIAPDummyModelKokkos::~MLIAPDummyModelKokkos() +{ + // manually decrement borrowed reference from Python + Py_DECREF(unified_interface); +} + +template +int MLIAPDummyModelKokkos::get_nparams() +{ + return nparams; +} + +template +int MLIAPDummyModelKokkos::get_gamma_nnz(class MLIAPData *) +{ + // TODO: get_gamma_nnz + return 0; +} + +/* ---------------------------------------------------------------------- + invoke compute_gradients from Cython interface + ---------------------------------------------------------------------- */ + +template +void MLIAPDummyModelKokkos::compute_gradients(class MLIAPData *data) +{ + PyGILState_STATE gstate = PyGILState_Ensure(); + auto *kokkos_data = dynamic_cast*>(data); + MLIAPDataKokkosDevice raw_data(*kokkos_data); + compute_gradients_python_kokkos(unified_interface, &raw_data); + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + MLIAPModelKokkos::error->all(FLERR, "Running mliappy unified compute_gradients failure."); + } + PyGILState_Release(gstate); +} + +// not implemented +template +void MLIAPDummyModelKokkos::compute_gradgrads(class MLIAPData *) +{ + MLIAPModelKokkos::error->all(FLERR, "compute_gradgrads not implemented"); +} + +// not implemented +template +void MLIAPDummyModelKokkos::compute_force_gradients(class MLIAPData *) +{ + MLIAPModelKokkos::error->all(FLERR, "compute_force_gradients not implemented"); +} + +/* ---------------------------------------------------------------------- + memory usage 
unclear due to Cython/Python implementation + ---------------------------------------------------------------------- */ + +template +double MLIAPDummyModelKokkos::memory_usage() +{ + // TODO: implement memory usage in Cython(?) + return 0; +} + +// not implemented +template +void MLIAPDummyModelKokkos::read_coeffs(char *) +{ + MLIAPModelKokkos::error->all(FLERR, "read_coeffs not implemented"); +} + +/* ---------------------------------------------------------------------- + build the unified interface object, connect to dummy model and descriptor + ---------------------------------------------------------------------- */ + +template +MLIAPBuildUnifiedKokkos_t LAMMPS_NS::build_unified(char *unified_fname, MLIAPDataKokkos *data, LAMMPS *lmp, + char *coefffilename) +{ + lmp->python->init(); + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject *pyMain = PyImport_AddModule("__main__"); + + if (!pyMain) { + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Could not initialize embedded Python"); + } + + PyImport_ImportModule("mliap_unified_couple_kokkos"); + + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Loading mliappy unified module failure."); + } + + // Connect dummy model, dummy descriptor, data to Python unified + MLIAPDummyModelKokkos *model = new MLIAPDummyModelKokkos(lmp, coefffilename); + MLIAPDummyDescriptorKokkos *descriptor = new MLIAPDummyDescriptorKokkos(lmp); + + PyObject *unified_interface = mliap_unified_connect_kokkos(unified_fname, model, descriptor); + if (PyErr_Occurred()) { + PyErr_Print(); + PyErr_Clear(); + PyGILState_Release(gstate); + lmp->error->all(FLERR, "Running mliappy unified module failure."); + } + + // Borrowed references must be manually incremented + model->unified_interface = unified_interface; + Py_INCREF(unified_interface); + descriptor->unified_interface = unified_interface; + Py_INCREF(unified_interface); + + PyGILState_Release(gstate); + + 
MLIAPBuildUnifiedKokkos_t build = {data, descriptor, model}; + return build; +} + +/* ---------------------------------------------------------------------- + set energy for ij atom pairs + ---------------------------------------------------------------------- */ + +void LAMMPS_NS::update_pair_energy(MLIAPDataKokkosDevice *data, double *eij) +{ + double e_total = 0.0; + auto d_eatoms = data->eatoms; + auto d_pair_i= data->pair_i; + const auto nlistatoms = data->nlistatoms; + Kokkos::parallel_for(nlistatoms, KOKKOS_LAMBDA(int ii){ + d_eatoms[ii] = 0; + }); + + Kokkos::parallel_reduce(data->npairs, KOKKOS_LAMBDA(int ii, double &local_sum){ + int i = d_pair_i[ii]; + double e = 0.5 * eij[ii]; + + // must not count any contribution where i is not a local atom + if (i < nlistatoms) { + Kokkos::atomic_add(&d_eatoms[i], e); + local_sum += e; + } + },*data->energy); +} + +/* ---------------------------------------------------------------------- + set forces for ij atom pairs + ---------------------------------------------------------------------- */ + +void LAMMPS_NS::update_pair_forces(MLIAPDataKokkosDevice *data, double *fij) +{ + const auto nlistatoms = data->nlistatoms; + auto *f = data->f; + auto pair_i = data->pair_i; + auto j_atoms = data->jatoms; + auto vflag = data->vflag; + auto rij = data->rij; + int vflag_either=data->pairmliap->vflag_either, vflag_global=data->pairmliap->vflag_global, vflag_atom=data->pairmliap->vflag_atom; + auto d_vatom = data->pairmliap->k_vatom.template view(); + Kokkos::View virial("virial"); + + Kokkos::parallel_for(data->npairs,KOKKOS_LAMBDA (int ii) { + + int ii3 = ii * 3; + int i = pair_i[ii]; + int j = j_atoms[ii]; + + // must not count any contribution where i is not a local atom + if (i < nlistatoms) { + Kokkos::atomic_add(&f[i*3+0], fij[ii3+0]); + Kokkos::atomic_add(&f[i*3+1], fij[ii3+1]); + Kokkos::atomic_add(&f[i*3+2], fij[ii3+2]); + Kokkos::atomic_add(&f[j*3+0],-fij[ii3+0]); + Kokkos::atomic_add(&f[j*3+1],-fij[ii3+1]); + 
Kokkos::atomic_add(&f[j*3+2],-fij[ii3+2]); + if (vflag) { + double v[6]; + v[0] = -rij[ii3+0]*fij[ii3+0]; + v[1] = -rij[ii3+1]*fij[ii3+1]; + v[2] = -rij[ii3+2]*fij[ii3+2]; + v[3] = -rij[ii3+0]*fij[ii3+1]; + v[4] = -rij[ii3+0]*fij[ii3+2]; + v[5] = -rij[ii3+1]*fij[ii3+2]; + if (vflag_global) { + Kokkos::atomic_add(&virial[0], v[0]); + Kokkos::atomic_add(&virial[1], v[1]); + Kokkos::atomic_add(&virial[2], v[2]); + Kokkos::atomic_add(&virial[3], v[3]); + Kokkos::atomic_add(&virial[4], v[4]); + Kokkos::atomic_add(&virial[5], v[5]); + } + if (vflag_atom) { + Kokkos::atomic_add(&d_vatom(i,0), 0.5*v[0]); + Kokkos::atomic_add(&d_vatom(i,1), 0.5*v[1]); + Kokkos::atomic_add(&d_vatom(i,2), 0.5*v[2]); + Kokkos::atomic_add(&d_vatom(i,3), 0.5*v[3]); + Kokkos::atomic_add(&d_vatom(i,4), 0.5*v[4]); + Kokkos::atomic_add(&d_vatom(i,5), 0.5*v[5]); + + Kokkos::atomic_add(&d_vatom(j,0), 0.5*v[0]); + Kokkos::atomic_add(&d_vatom(j,1), 0.5*v[1]); + Kokkos::atomic_add(&d_vatom(j,2), 0.5*v[2]); + Kokkos::atomic_add(&d_vatom(j,3), 0.5*v[3]); + Kokkos::atomic_add(&d_vatom(j,4), 0.5*v[4]); + Kokkos::atomic_add(&d_vatom(j,5), 0.5*v[5]); + } + } + } + }); + + if (vflag) { + if (vflag_global) { + Kokkos::View h_virial("h_virial"); + Kokkos::deep_copy(h_virial,virial); + for (int i=0;i<6;++i) + data->pairmliap->virial[i]+=h_virial[i]; + } + if (vflag_atom) { + data->pairmliap->k_vatom.template modify(); + data->pairmliap->k_vatom.template sync(); + } + } +} + +namespace LAMMPS_NS { +template class MLIAPDummyModelKokkos; +template class MLIAPDummyDescriptorKokkos; +template MLIAPBuildUnifiedKokkos_t LAMMPS_NS::build_unified(char *unified_fname, MLIAPDataKokkos *data, LAMMPS *lmp, + char *coefffilename); +//template void LAMMPS_NS::update_pair_energy(MLIAPDataKokkos *data, double *eij); +//template void LAMMPS_NS::update_pair_forces(MLIAPDataKokkos *data, double *fij); +#ifdef LMP_KOKKOS_GPU +template class MLIAPDummyModelKokkos; +template class MLIAPDummyDescriptorKokkos; +template 
MLIAPBuildUnifiedKokkos_t LAMMPS_NS::build_unified(char *unified_fname, MLIAPDataKokkos *data, LAMMPS *lmp, + char *coefffilename); +//template void LAMMPS_NS::update_pair_energy(MLIAPDataKokkos *data, double *eij); +//template void LAMMPS_NS::update_pair_forces(MLIAPDataKokkos *data, double *fij); +#endif +} +#endif + diff --git a/src/KOKKOS/mliap_unified_kokkos.h b/src/KOKKOS/mliap_unified_kokkos.h new file mode 100644 index 0000000000..aad25891b0 --- /dev/null +++ b/src/KOKKOS/mliap_unified_kokkos.h @@ -0,0 +1,66 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_MLIAP_UNIFIED_KOKKOS_H +#define LMP_MLIAP_UNIFIED_KOKKOS_H + +#include "mliap_unified.h" +#include "mliap_descriptor_kokkos.h" +#include "mliap_model_kokkos.h" +#include "mliap_data_kokkos.h" + +#include + +namespace LAMMPS_NS { +template +class MLIAPDummyDescriptorKokkos : public MLIAPDummyDescriptor, public MLIAPDescriptorKokkos{ + public: + MLIAPDummyDescriptorKokkos(LAMMPS *); + ~MLIAPDummyDescriptorKokkos() override; + void compute_descriptors(class MLIAPData *) override; + void compute_forces(class MLIAPData *) override; + void compute_force_gradients(class MLIAPData *) override; + void compute_descriptor_gradients(class MLIAPData *) override; + void init() override; + void set_elements(char **, int); +}; +template +class MLIAPDummyModelKokkos : public MLIAPDummyModel, public MLIAPModelKokkos { + public: + MLIAPDummyModelKokkos(LAMMPS *, char * = nullptr); + ~MLIAPDummyModelKokkos() override; + int get_nparams() override; + int get_gamma_nnz(class MLIAPData *) override; + void compute_gradients(class MLIAPData *) override; + void compute_gradgrads(class MLIAPData *) override; + void compute_force_gradients(class MLIAPData *) override; + double memory_usage() override; + + protected: + void read_coeffs(char *) override; +}; + +template +struct MLIAPBuildUnifiedKokkos_t { + MLIAPDataKokkos *data; + MLIAPDummyDescriptorKokkos *descriptor; + MLIAPDummyModelKokkos *model; +}; +template +MLIAPBuildUnifiedKokkos_t build_unified(char *, MLIAPDataKokkos *, LAMMPS *, char * = NULL); +void update_pair_energy(MLIAPDataKokkosDevice *, double *); +void update_pair_forces(MLIAPDataKokkosDevice *, double *); + +} // namespace LAMMPS_NS + +#endif diff --git a/src/KOKKOS/pair_meam_kokkos.cpp b/src/KOKKOS/pair_meam_kokkos.cpp index 90e714cefe..c2b03c2054 100644 --- a/src/KOKKOS/pair_meam_kokkos.cpp +++ b/src/KOKKOS/pair_meam_kokkos.cpp @@ -51,6 +51,7 @@ 
PairMEAMKokkos::PairMEAMKokkos(LAMMPS *lmp) : PairMEAM(lmp) delete meam_inst; meam_inst_kk = new MEAMKokkos(memory); meam_inst = meam_inst_kk; + myname = "meam/kk"; } /* ---------------------------------------------------------------------- */ @@ -156,7 +157,8 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) int need_dup = lmp->kokkos->need_dup(); - meam_inst_kk->meam_dens_init(inum_half,ntype,type,d_map,x,d_numneigh_half,d_numneigh_full,d_ilist_half,d_neighbors_half, d_neighbors_full, d_offset, neighflag, need_dup); + meam_inst_kk->meam_dens_init(inum_half,ntype,type,d_map,x,d_numneigh_half,d_numneigh_full, + d_ilist_half,d_neighbors_half, d_neighbors_full, d_offset, neighflag, need_dup); meam_inst_kk->k_rho0.template modify(); meam_inst_kk->k_arho2b.template modify(); @@ -166,6 +168,13 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) meam_inst_kk->k_arho3b.template modify(); meam_inst_kk->k_t_ave.template modify(); meam_inst_kk->k_tsq_ave.template modify(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.template modify(); + meam_inst_kk->k_arho1m.template modify(); + meam_inst_kk->k_arho2m.template modify(); + meam_inst_kk->k_arho3m.template modify(); + meam_inst_kk->k_arho3mb.template modify(); + } comm->reverse_comm(this); @@ -177,6 +186,13 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) meam_inst_kk->k_arho3b.template sync(); meam_inst_kk->k_t_ave.template sync(); meam_inst_kk->k_tsq_ave.template sync(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.template sync(); + meam_inst_kk->k_arho1m.template sync(); + meam_inst_kk->k_arho2m.template sync(); + meam_inst_kk->k_arho3m.template sync(); + meam_inst_kk->k_arho3mb.template sync(); + } meam_inst_kk->meam_dens_final(nlocal,eflag_either,eflag_global,eflag_atom, d_eatom,ntype,type,d_map,d_scale,errorflag,ev); @@ -200,6 +216,13 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) meam_inst_kk->k_arho3b.template modify(); meam_inst_kk->k_t_ave.template modify(); 
meam_inst_kk->k_tsq_ave.template modify(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.template modify(); + meam_inst_kk->k_arho1m.template modify(); + meam_inst_kk->k_arho2m.template modify(); + meam_inst_kk->k_arho3m.template modify(); + meam_inst_kk->k_arho3mb.template modify(); + } comm->forward_comm(this); @@ -219,6 +242,13 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) meam_inst_kk->k_arho3b.template sync(); meam_inst_kk->k_t_ave.template sync(); meam_inst_kk->k_tsq_ave.template sync(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.template sync(); + meam_inst_kk->k_arho1m.template sync(); + meam_inst_kk->k_arho2m.template sync(); + meam_inst_kk->k_arho3m.template sync(); + meam_inst_kk->k_arho3mb.template sync(); + } meam_inst_kk->meam_force(inum_half,eflag_global,eflag_atom,vflag_global, vflag_atom,d_eatom,ntype,type,d_map,x, @@ -315,7 +345,7 @@ int PairMEAMKokkos::pack_forward_comm_kokkos(int n, DAT::tdual_int_2 iswap = iswap_in; v_buf = buf.view(); Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); - return n*38; + return n*comm_forward; } /* ---------------------------------------------------------------------- */ @@ -324,7 +354,7 @@ template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMPackForwardComm, const int &i) const { int j = d_sendlist(iswap, i); - int m = i*38; + int m = i*comm_forward; v_buf[m++] = d_rho0[j]; v_buf[m++] = d_rho1[j]; v_buf[m++] = d_rho2[j]; @@ -354,6 +384,22 @@ void PairMEAMKokkos::operator()(TagPairMEAMPackForwardComm, const in v_buf[m++] = d_tsq_ave(j,0); v_buf[m++] = d_tsq_ave(j,1); v_buf[m++] = d_tsq_ave(j,2); + if (msmeamflag) { + v_buf[m++] = d_arho2mb[j]; + v_buf[m++] = d_arho1m(j,0); + v_buf[m++] = d_arho1m(j,1); + v_buf[m++] = d_arho1m(j,2); + v_buf[m++] = d_arho2m(j,0); + v_buf[m++] = d_arho2m(j,1); + v_buf[m++] = d_arho2m(j,2); + v_buf[m++] = d_arho2m(j,3); + v_buf[m++] = d_arho2m(j,4); + v_buf[m++] = d_arho2m(j,5); + for (int k = 0; k < 10; k++) v_buf[m++] = d_arho3m(j,k); + 
v_buf[m++] = d_arho3mb(j,0); + v_buf[m++] = d_arho3mb(j,1); + v_buf[m++] = d_arho3mb(j,2); + } } /* ---------------------------------------------------------------------- */ @@ -371,7 +417,8 @@ void PairMEAMKokkos::unpack_forward_comm_kokkos(int n, int first_in, template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMUnpackForwardComm, const int &i) const{ - int m = i*38; + //int m = i*38; + int m = i*comm_forward; d_rho0[i+first] = v_buf[m++]; d_rho1[i+first] = v_buf[m++]; @@ -402,6 +449,22 @@ void PairMEAMKokkos::operator()(TagPairMEAMUnpackForwardComm, const d_tsq_ave(i+first,0) = v_buf[m++]; d_tsq_ave(i+first,1) = v_buf[m++]; d_tsq_ave(i+first,2) = v_buf[m++]; + if (msmeamflag) { + d_arho2mb[i+first] = v_buf[m++]; + d_arho1m(i+first,0) = v_buf[m++]; + d_arho1m(i+first,1) = v_buf[m++]; + d_arho1m(i+first,2) = v_buf[m++]; + d_arho2m(i+first,0) = v_buf[m++]; + d_arho2m(i+first,1) = v_buf[m++]; + d_arho2m(i+first,2) = v_buf[m++]; + d_arho2m(i+first,3) = v_buf[m++]; + d_arho2m(i+first,4) = v_buf[m++]; + d_arho2m(i+first,5) = v_buf[m++]; + for (int k = 0; k < 10; k++) d_arho3m(i+first,k) = v_buf[m++]; + d_arho3mb(i+first,0) = v_buf[m++]; + d_arho3mb(i+first,1) = v_buf[m++]; + d_arho3mb(i+first,2) = v_buf[m++]; + } } /* ---------------------------------------------------------------------- */ @@ -426,6 +489,13 @@ int PairMEAMKokkos::pack_forward_comm(int n, int *list, double *buf, meam_inst_kk->k_arho3b.sync_host(); meam_inst_kk->k_t_ave.sync_host(); meam_inst_kk->k_tsq_ave.sync_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.sync_host(); + meam_inst_kk->k_arho1m.sync_host(); + meam_inst_kk->k_arho2m.sync_host(); + meam_inst_kk->k_arho3m.sync_host(); + meam_inst_kk->k_arho3mb.sync_host(); + } int m = 0; for (int i = 0; i < n; i++) { @@ -459,6 +529,22 @@ int PairMEAMKokkos::pack_forward_comm(int n, int *list, double *buf, buf[m++] = meam_inst_kk->h_tsq_ave(j,0); buf[m++] = meam_inst_kk->h_tsq_ave(j,1); buf[m++] = meam_inst_kk->h_tsq_ave(j,2); + 
if (msmeamflag) { + buf[m++] = meam_inst_kk->h_arho2mb[j]; + buf[m++] = meam_inst_kk->h_arho1m(j,0); + buf[m++] = meam_inst_kk->h_arho1m(j,1); + buf[m++] = meam_inst_kk->h_arho1m(j,2); + buf[m++] = meam_inst_kk->h_arho2m(j,0); + buf[m++] = meam_inst_kk->h_arho2m(j,1); + buf[m++] = meam_inst_kk->h_arho2m(j,2); + buf[m++] = meam_inst_kk->h_arho2m(j,3); + buf[m++] = meam_inst_kk->h_arho2m(j,4); + buf[m++] = meam_inst_kk->h_arho2m(j,5); + for (int k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3m(j,k); + buf[m++] = meam_inst_kk->h_arho3mb(j,0); + buf[m++] = meam_inst_kk->h_arho3mb(j,1); + buf[m++] = meam_inst_kk->h_arho3mb(j,2); + } } return m; @@ -485,6 +571,13 @@ void PairMEAMKokkos::unpack_forward_comm(int n, int first, double *b meam_inst_kk->k_arho3b.sync_host(); meam_inst_kk->k_t_ave.sync_host(); meam_inst_kk->k_tsq_ave.sync_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.sync_host(); + meam_inst_kk->k_arho1m.sync_host(); + meam_inst_kk->k_arho2m.sync_host(); + meam_inst_kk->k_arho3m.sync_host(); + meam_inst_kk->k_arho3mb.sync_host(); + } int m = 0; const int last = first + n; @@ -518,6 +611,22 @@ void PairMEAMKokkos::unpack_forward_comm(int n, int first, double *b meam_inst_kk->h_tsq_ave(i,0) = buf[m++]; meam_inst_kk->h_tsq_ave(i,1) = buf[m++]; meam_inst_kk->h_tsq_ave(i,2) = buf[m++]; + if (msmeamflag) { + meam_inst_kk->h_arho2mb[i] = buf[m++]; + meam_inst_kk->h_arho1m(i,0) = buf[m++]; + meam_inst_kk->h_arho1m(i,1) = buf[m++]; + meam_inst_kk->h_arho1m(i,2) = buf[m++]; + meam_inst_kk->h_arho2m(i,0) = buf[m++]; + meam_inst_kk->h_arho2m(i,1) = buf[m++]; + meam_inst_kk->h_arho2m(i,2) = buf[m++]; + meam_inst_kk->h_arho2m(i,3) = buf[m++]; + meam_inst_kk->h_arho2m(i,4) = buf[m++]; + meam_inst_kk->h_arho2m(i,5) = buf[m++]; + for (int k = 0; k < 10; k++) meam_inst_kk->h_arho3m(i,k) = buf[m++]; + meam_inst_kk->h_arho3mb(i,0) = buf[m++]; + meam_inst_kk->h_arho3mb(i,1) = buf[m++]; + meam_inst_kk->h_arho3mb(i,2) = buf[m++]; + } } meam_inst_kk->k_rho0.modify_host(); 
@@ -536,6 +645,13 @@ void PairMEAMKokkos::unpack_forward_comm(int n, int first, double *b meam_inst_kk->k_arho3b.modify_host(); meam_inst_kk->k_t_ave.modify_host(); meam_inst_kk->k_tsq_ave.modify_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.modify_host(); + meam_inst_kk->k_arho1m.modify_host(); + meam_inst_kk->k_arho2m.modify_host(); + meam_inst_kk->k_arho3m.modify_host(); + meam_inst_kk->k_arho3mb.modify_host(); + } } /* ---------------------------------------------------------------------- */ @@ -546,7 +662,8 @@ int PairMEAMKokkos::pack_reverse_comm_kokkos(int n, int first_in, DA first = first_in; v_buf = buf.view(); Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); - return n*30; + //return n*30; + return n*comm_reverse; } /* ---------------------------------------------------------------------- */ @@ -554,7 +671,8 @@ int PairMEAMKokkos::pack_reverse_comm_kokkos(int n, int first_in, DA template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMPackReverseComm, const int &i) const { - int m = i*30; + //int m = i*30; + int m = i*comm_reverse; v_buf[m++] = d_rho0[i+first]; v_buf[m++] = d_arho2b[i+first]; @@ -577,6 +695,22 @@ void PairMEAMKokkos::operator()(TagPairMEAMPackReverseComm, const in v_buf[m++] = d_tsq_ave(i+first,0); v_buf[m++] = d_tsq_ave(i+first,1); v_buf[m++] = d_tsq_ave(i+first,2); + if (msmeamflag) { + v_buf[m++] = d_arho2mb[i+first]; + v_buf[m++] = d_arho1m(i+first,0); + v_buf[m++] = d_arho1m(i+first,1); + v_buf[m++] = d_arho1m(i+first,2); + v_buf[m++] = d_arho2m(i+first,0); + v_buf[m++] = d_arho2m(i+first,1); + v_buf[m++] = d_arho2m(i+first,2); + v_buf[m++] = d_arho2m(i+first,3); + v_buf[m++] = d_arho2m(i+first,4); + v_buf[m++] = d_arho2m(i+first,5); + for (int k = 0; k < 10; k++) v_buf[m++] = d_arho3m(i+first,k); + v_buf[m++] = d_arho3mb(i+first,0); + v_buf[m++] = d_arho3mb(i+first,1); + v_buf[m++] = d_arho3mb(i+first,2); + } } /* ---------------------------------------------------------------------- */ @@ -592,6 
+726,13 @@ int PairMEAMKokkos::pack_reverse_comm(int n, int first, double *buf) meam_inst_kk->k_arho3b.sync_host(); meam_inst_kk->k_t_ave.sync_host(); meam_inst_kk->k_tsq_ave.sync_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.sync_host(); + meam_inst_kk->k_arho1m.sync_host(); + meam_inst_kk->k_arho2m.sync_host(); + meam_inst_kk->k_arho3m.sync_host(); + meam_inst_kk->k_arho3mb.sync_host(); + } int m = 0; const int last = first + n; @@ -617,6 +758,22 @@ int PairMEAMKokkos::pack_reverse_comm(int n, int first, double *buf) buf[m++] = meam_inst_kk->h_tsq_ave(i,0); buf[m++] = meam_inst_kk->h_tsq_ave(i,1); buf[m++] = meam_inst_kk->h_tsq_ave(i,2); + if (msmeamflag) { + buf[m++] = meam_inst_kk->h_arho2mb[i]; + buf[m++] = meam_inst_kk->h_arho1m(i,0); + buf[m++] = meam_inst_kk->h_arho1m(i,1); + buf[m++] = meam_inst_kk->h_arho1m(i,2); + buf[m++] = meam_inst_kk->h_arho2m(i,0); + buf[m++] = meam_inst_kk->h_arho2m(i,1); + buf[m++] = meam_inst_kk->h_arho2m(i,2); + buf[m++] = meam_inst_kk->h_arho2m(i,3); + buf[m++] = meam_inst_kk->h_arho2m(i,4); + buf[m++] = meam_inst_kk->h_arho2m(i,5); + for (int k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3m(i,k); + buf[m++] = meam_inst_kk->h_arho3mb(i,0); + buf[m++] = meam_inst_kk->h_arho3mb(i,1); + buf[m++] = meam_inst_kk->h_arho3mb(i,2); + } } return m; @@ -639,7 +796,8 @@ template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMUnpackReverseComm, const int &i) const { int j = d_sendlist(iswap, i); - int m = i*30; + //int m = i*30; + int m = i*comm_reverse; d_rho0[j] += v_buf[m++]; d_arho2b[j] += v_buf[m++]; @@ -662,6 +820,22 @@ void PairMEAMKokkos::operator()(TagPairMEAMUnpackReverseComm, const d_tsq_ave(j,0) += v_buf[m++]; d_tsq_ave(j,1) += v_buf[m++]; d_tsq_ave(j,2) += v_buf[m++]; + if (msmeamflag) { + d_arho2mb[j] += v_buf[m++]; + d_arho1m(j,0) += v_buf[m++]; + d_arho1m(j,1) += v_buf[m++]; + d_arho1m(j,2) += v_buf[m++]; + d_arho2m(j,0) += v_buf[m++]; + d_arho2m(j,1) += v_buf[m++]; + d_arho2m(j,2) += 
v_buf[m++]; + d_arho2m(j,3) += v_buf[m++]; + d_arho2m(j,4) += v_buf[m++]; + d_arho2m(j,5) += v_buf[m++]; + for (int k = 0; k < 10; k++) d_arho3m(j,k) += v_buf[m++]; + d_arho3mb(j,0) += v_buf[m++]; + d_arho3mb(j,1) += v_buf[m++]; + d_arho3mb(j,2) += v_buf[m++]; + } } /* ---------------------------------------------------------------------- */ @@ -677,6 +851,13 @@ void PairMEAMKokkos::unpack_reverse_comm(int n, int *list, double *b meam_inst_kk->k_arho3b.sync_host(); meam_inst_kk->k_t_ave.sync_host(); meam_inst_kk->k_tsq_ave.sync_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.sync_host(); + meam_inst_kk->k_arho1m.sync_host(); + meam_inst_kk->k_arho2m.sync_host(); + meam_inst_kk->k_arho3m.sync_host(); + meam_inst_kk->k_arho3mb.sync_host(); + } int m = 0; for (int i = 0; i < n; i++) { @@ -702,6 +883,22 @@ void PairMEAMKokkos::unpack_reverse_comm(int n, int *list, double *b meam_inst_kk->h_tsq_ave(j,0) += buf[m++]; meam_inst_kk->h_tsq_ave(j,1) += buf[m++]; meam_inst_kk->h_tsq_ave(j,2) += buf[m++]; + if (msmeamflag) { + meam_inst_kk->h_arho2mb[j] += buf[m++]; + meam_inst_kk->h_arho1m(j,0) += buf[m++]; + meam_inst_kk->h_arho1m(j,1) += buf[m++]; + meam_inst_kk->h_arho1m(j,2) += buf[m++]; + meam_inst_kk->h_arho2m(j,0) += buf[m++]; + meam_inst_kk->h_arho2m(j,1) += buf[m++]; + meam_inst_kk->h_arho2m(j,2) += buf[m++]; + meam_inst_kk->h_arho2m(j,3) += buf[m++]; + meam_inst_kk->h_arho2m(j,4) += buf[m++]; + meam_inst_kk->h_arho2m(j,5) += buf[m++]; + for (int k = 0; k < 10; k++) meam_inst_kk->h_arho3m(j,k) += buf[m++]; + meam_inst_kk->h_arho3mb(j,0) += buf[m++]; + meam_inst_kk->h_arho3mb(j,1) += buf[m++]; + meam_inst_kk->h_arho3mb(j,2) += buf[m++]; + } } meam_inst_kk->k_rho0.modify_host(); @@ -712,6 +909,13 @@ void PairMEAMKokkos::unpack_reverse_comm(int n, int *list, double *b meam_inst_kk->k_arho3b.modify_host(); meam_inst_kk->k_t_ave.modify_host(); meam_inst_kk->k_tsq_ave.modify_host(); + if (msmeamflag) { + meam_inst_kk->k_arho2mb.modify_host(); + 
meam_inst_kk->k_arho1m.modify_host(); + meam_inst_kk->k_arho2m.modify_host(); + meam_inst_kk->k_arho3m.modify_host(); + meam_inst_kk->k_arho3mb.modify_host(); + } } /* ---------------------------------------------------------------------- @@ -764,6 +968,12 @@ void PairMEAMKokkos::update_meam_views() d_arho3b = meam_inst_kk->d_arho3b; d_t_ave = meam_inst_kk->d_t_ave; d_tsq_ave = meam_inst_kk->d_tsq_ave; + // msmeam + d_arho1m = meam_inst_kk->d_arho1m; + d_arho2m = meam_inst_kk->d_arho2m; + d_arho3m = meam_inst_kk->d_arho3m; + d_arho2mb = meam_inst_kk->d_arho2mb; + d_arho3mb = meam_inst_kk->d_arho3mb; } /* ---------------------------------------------------------------------- */ diff --git a/src/KOKKOS/pair_meam_kokkos.h b/src/KOKKOS/pair_meam_kokkos.h index c5fe82fa79..0d0d7667f3 100644 --- a/src/KOKKOS/pair_meam_kokkos.h +++ b/src/KOKKOS/pair_meam_kokkos.h @@ -13,12 +13,12 @@ #ifdef PAIR_CLASS // clang-format off -PairStyle(meam/c/kk,PairMEAMKokkos) -PairStyle(meam/c/kk/device,PairMEAMKokkos) -PairStyle(meam/c/kk/host,PairMEAMKokkos) -PairStyle(meam/kk,PairMEAMKokkos) -PairStyle(meam/kk/device,PairMEAMKokkos) -PairStyle(meam/kk/host,PairMEAMKokkos) +PairStyle(meam/c/kk,PairMEAMKokkos); +PairStyle(meam/c/kk/device,PairMEAMKokkos); +PairStyle(meam/c/kk/host,PairMEAMKokkos); +PairStyle(meam/kk,PairMEAMKokkos); +PairStyle(meam/kk/device,PairMEAMKokkos); +PairStyle(meam/kk/host,PairMEAMKokkos); // clang-format on #else @@ -117,6 +117,9 @@ class PairMEAMKokkos : public PairMEAM, public KokkosBase { typename ArrayTypes::t_ffloat_1d d_rho, d_rho0, d_rho1, d_rho2, d_rho3, d_frhop; typename ArrayTypes::t_ffloat_1d d_gamma, d_dgamma1, d_dgamma2, d_dgamma3, d_arho2b; typename ArrayTypes::t_ffloat_2d d_arho1, d_arho2, d_arho3, d_arho3b, d_t_ave, d_tsq_ave; + // msmeam params + typename ArrayTypes::t_ffloat_1d d_arho2mb; + typename ArrayTypes::t_ffloat_2d d_arho1m, d_arho2m, d_arho3m, d_arho3mb; void update_meam_views(); diff --git a/src/KOKKOS/pair_meam_ms_kokkos.cpp 
b/src/KOKKOS/pair_meam_ms_kokkos.cpp new file mode 100644 index 0000000000..491fc0273c --- /dev/null +++ b/src/KOKKOS/pair_meam_ms_kokkos.cpp @@ -0,0 +1,32 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "pair_meam_ms_kokkos.h" +#include "meam.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ +template +PairMEAMMSKokkos::PairMEAMMSKokkos(LAMMPS *lmp) : PairMEAMKokkos(lmp) +{ + this->meam_inst->msmeamflag = this->msmeamflag = 1; + this->myname = "meam/ms/kk"; +} + +namespace LAMMPS_NS { +template class PairMEAMMSKokkos; +#ifdef KOKKOS_ENABLE_CUDA +template class PairMEAMMSKokkos; +#endif +} diff --git a/src/KOKKOS/pair_meam_ms_kokkos.h b/src/KOKKOS/pair_meam_ms_kokkos.h new file mode 100644 index 0000000000..a2cefc2c16 --- /dev/null +++ b/src/KOKKOS/pair_meam_ms_kokkos.h @@ -0,0 +1,36 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. 
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(meam/ms/kk,PairMEAMMSKokkos); +PairStyle(meam/ms/kk/device,PairMEAMMSKokkos); +PairStyle(meam/ms/kk/host,PairMEAMMSKokkos); +// clang-format on +#else + +#ifndef LMP_PAIR_MEAM_MS_KOKKOS_H +#define LMP_PAIR_MEAM_MS_KOKKOS_H + +#include "pair_meam_kokkos.h" + +namespace LAMMPS_NS { + +template +class PairMEAMMSKokkos : public PairMEAMKokkos { + public: + PairMEAMMSKokkos(class LAMMPS *); +}; +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/KOKKOS/pair_mliap_kokkos.cpp b/src/KOKKOS/pair_mliap_kokkos.cpp index d26b6367f8..71e45085ea 100644 --- a/src/KOKKOS/pair_mliap_kokkos.cpp +++ b/src/KOKKOS/pair_mliap_kokkos.cpp @@ -24,6 +24,7 @@ #include "mliap_model_linear_kokkos.h" #ifdef MLIAP_PYTHON #include "mliap_model_python_kokkos.h" +#include "mliap_unified_kokkos.h" #endif #include "error.h" #include "neigh_request.h" @@ -66,7 +67,6 @@ PairMLIAPKokkos::~PairMLIAPKokkos() template void PairMLIAPKokkos::compute(int eflag, int vflag) { - atomKK->sync(Host,F_MASK | ENERGY_MASK | VIRIAL_MASK); atomKK->sync(execution_space,X_MASK | TYPE_MASK ); MLIAPDataKokkos *k_data = (MLIAPDataKokkos*)(data); @@ -97,7 +97,7 @@ void PairMLIAPKokkos::compute(int eflag, int vflag) // compute descriptors, if needed if (model->nonlinearflag || eflag) { - k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | JATOMS_MASK | JELEMS_MASK | RIJ_MASK ); + k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | ELEMS_MASK | JATOMS_MASK | PAIR_I_MASK | JELEMS_MASK | RIJ_MASK ); descriptor->compute_descriptors(data); if (!is_kokkos_descriptor) k_data->modified(descriptor_space, DESCRIPTORS_MASK); @@ -109,12 +109,13 @@ void PairMLIAPKokkos::compute(int eflag, int vflag) k_data->modified(model_space, BETAS_MASK); if (eflag_atom) k_data->modified(model_space, 
EATOMS_MASK); - e_tally(data); // calculate force contributions beta_i*dB_i/dR_j - k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | BETAS_MASK | JATOMS_MASK | JELEMS_MASK | RIJ_MASK ); + k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | ELEMS_MASK | BETAS_MASK | JATOMS_MASK | PAIR_I_MASK | JELEMS_MASK | RIJ_MASK ); descriptor->compute_forces(data); + e_tally(data); + if (evflag) { atomKK->modified(descriptor_space,F_MASK | ENERGY_MASK | VIRIAL_MASK); atomKK->sync(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK); @@ -181,6 +182,25 @@ void PairMLIAPKokkos::settings(int narg, char ** arg) iarg += 3; } else new_args.push_back(arg[iarg++]); + } else if (strcmp(arg[iarg], "unified") == 0) { +#ifdef MLIAP_PYTHON + printf("IN SETUP UNIFIED\n"); + if (model != nullptr) error->all(FLERR,"Illegal multiple pair_style mliap model definitions"); + if (descriptor != nullptr) error->all(FLERR,"Illegal multiple pair_style mliap descriptor definitions"); + if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "pair_style mliap unified", error); + MLIAPBuildUnifiedKokkos_t build = build_unified(arg[iarg+1], dynamic_cast*>(data), lmp); + if (iarg+3 > narg) { + ghostneigh = 0; + } else { + ghostneigh = utils::logical(FLERR, arg[iarg+2], false, lmp); + } + + iarg += 3; + model = build.model; + descriptor = build.descriptor; +#else + error->all(FLERR,"Using pair_style mliap unified requires ML-IAP with python support"); +#endif } else new_args.push_back(arg[iarg++]); } @@ -226,13 +246,6 @@ void PairMLIAPKokkos::coeff(int narg, char **arg) { k_map.modify(); k_map.sync(); - auto h_cutsq=k_cutsq.template view(); - for (int itype=1; itype <= atom->ntypes; ++itype) - for (int jtype=1; jtype <= atom->ntypes; ++jtype) - h_cutsq(itype,jtype) = descriptor->cutsq[map[itype]][map[jtype]]; - k_cutsq.modify(); - k_cutsq.sync(); - // clear setflag since coeff() called once with I,J = * * int n = atom->ntypes; @@ -257,6 +270,13 @@ void 
PairMLIAPKokkos::coeff(int narg, char **arg) { // set up model, descriptor, and mliap data structures model->init(); descriptor->init(); + + auto h_cutsq=k_cutsq.template view(); + for (int itype=1; itype <= atom->ntypes; ++itype) + for (int jtype=1; jtype <= atom->ntypes; ++jtype) + h_cutsq(itype,jtype) = descriptor->cutsq[map[itype]][map[jtype]]; + k_cutsq.modify(); + k_cutsq.sync(); int gradgradflag = -1; delete data; data = new MLIAPDataKokkos(lmp, gradgradflag, map, model, descriptor, this); diff --git a/src/KOKKOS/region_block_kokkos.cpp b/src/KOKKOS/region_block_kokkos.cpp index cfee46e916..c53fae7b03 100644 --- a/src/KOKKOS/region_block_kokkos.cpp +++ b/src/KOKKOS/region_block_kokkos.cpp @@ -48,7 +48,8 @@ void RegBlockKokkos::match_all_kokkos(int groupbit_in, DAT::tdual_in groupbit = groupbit_in; d_match = k_match_in.template view(); - atomKK->sync(Device, X_MASK | MASK_MASK); + auto execution_space = ExecutionSpaceFromDevice::space; + atomKK->sync(execution_space, X_MASK | MASK_MASK); x = atomKK->k_x.view(); mask = atomKK->k_mask.view(); diff --git a/src/MEAM/meam.h b/src/MEAM/meam.h index 9ec7de3426..5a131bdc34 100644 --- a/src/MEAM/meam.h +++ b/src/MEAM/meam.h @@ -17,7 +17,7 @@ #include #include -#define maxelt 5 +constexpr int maxelt = 5; namespace LAMMPS_NS { class Memory; @@ -30,6 +30,7 @@ class MEAM { virtual ~MEAM(); int copymode; + int msmeamflag; protected: Memory *memory; @@ -74,6 +75,12 @@ class MEAM { // vind[23]D = Voight notation index maps for 2 and 3D // v2D,v3D = array of factors to apply for Voight notation + // MS-MEAM parameters + + // msmeamflag = flag to activate MS-MEAM + // betam[1-3]_meam = MS-MEAM electron density constants + // tm[1-3]_meam = MS-MEAM coefficients on densities in Gamma computation + // nr,dr = pair function discretization parameters // nrar,rdrar = spline coeff array parameters @@ -115,12 +122,22 @@ class MEAM { int nr, nrar; double dr, rdrar; + // MS-MEAM parameters + + double t1m_meam[maxelt], t2m_meam[maxelt], 
t3m_meam[maxelt]; + double beta1m_meam[maxelt], beta2m_meam[maxelt], beta3m_meam[maxelt]; + //int msmeamflag; // made public for pair style settings + public: int nmax; double *rho, *rho0, *rho1, *rho2, *rho3, *frhop; double *gamma, *dgamma1, *dgamma2, *dgamma3, *arho2b; double **arho1, **arho2, **arho3, **arho3b, **t_ave, **tsq_ave; + // MS-MEAM arrays + + double **arho1m, **arho2m, *arho2mb, **arho3m, **arho3mb; + int maxneigh; double *scrfcn, *dscrfcn, *fcpair; @@ -242,7 +259,7 @@ class MEAM { double, double, double, double, double, int, int, lattice_t); void get_sijk(double, int, int, int, double *); void get_densref(double, int, int, double *, double *, double *, double *, double *, double *, - double *, double *); + double *, double *, double *, double *, double *, double *, double *, double *); // last 6 args for msmeam void interpolate_meam(int); public: @@ -282,10 +299,12 @@ class MEAM { } // clang-format on static int get_Zij(const lattice_t latt); + // last 6 args are optional msmeam parameters void meam_setup_global(int nelt, lattice_t *lat, int *ielement, double *atwt, double *alpha, double *b0, double *b1, double *b2, double *b3, double *alat, double *esub, double *asub, double *t0, double *t1, double *t2, double *t3, - double *rozero, int *ibar); + double *rozero, int *ibar, double *b1m, double *b2m, double *b3m, + double *t1m, double *t2m, double *t3m); void meam_setup_param(int which, double value, int nindex, int *index /*index(3)*/, int *errorflag); virtual void meam_setup_done(double *cutmax); diff --git a/src/MEAM/meam_dens_final.cpp b/src/MEAM/meam_dens_final.cpp index cf964a4724..ab0ac8c53f 100644 --- a/src/MEAM/meam_dens_final.cpp +++ b/src/MEAM/meam_dens_final.cpp @@ -27,115 +27,222 @@ MEAM::meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_ // Complete the calculation of density - for (i = 0; i < nlocal; i++) { - elti = fmap[type[i]]; - if (elti >= 0) { - scaleii = scale[type[i]][type[i]]; - rho1[i] = 0.0; - rho2[i] 
= -1.0 / 3.0 * arho2b[i] * arho2b[i]; - rho3[i] = 0.0; - for (m = 0; m < 3; m++) { - rho1[i] = rho1[i] + arho1[i][m] * arho1[i][m]; - rho3[i] = rho3[i] - 3.0 / 5.0 * arho3b[i][m] * arho3b[i][m]; - } - for (m = 0; m < 6; m++) { - rho2[i] = rho2[i] + this->v2D[m] * arho2[i][m] * arho2[i][m]; - } - for (m = 0; m < 10; m++) { - rho3[i] = rho3[i] + this->v3D[m] * arho3[i][m] * arho3[i][m]; - } - - if (rho0[i] > 0.0) { - if (this->ialloy == 1) { - t_ave[i][0] = fdiv_zero(t_ave[i][0], tsq_ave[i][0]); - t_ave[i][1] = fdiv_zero(t_ave[i][1], tsq_ave[i][1]); - t_ave[i][2] = fdiv_zero(t_ave[i][2], tsq_ave[i][2]); - } else if (this->ialloy == 2) { - t_ave[i][0] = this->t1_meam[elti]; - t_ave[i][1] = this->t2_meam[elti]; - t_ave[i][2] = this->t3_meam[elti]; - } else { - t_ave[i][0] = t_ave[i][0] / rho0[i]; - t_ave[i][1] = t_ave[i][1] / rho0[i]; - t_ave[i][2] = t_ave[i][2] / rho0[i]; + if (this->msmeamflag) { + for (i = 0; i < nlocal; i++) { + elti = fmap[type[i]]; + if (elti >= 0) { + scaleii = scale[type[i]][type[i]]; + rho1[i] = 0.0; + rho2[i] = -1.0 / 3.0 * (arho2b[i] * arho2b[i] + - arho2mb[i] * arho2mb[i]); + rho3[i] = 0.0; + for (m = 0; m < 3; m++) { + rho1[i] = rho1[i] + arho1[i][m] * arho1[i][m] + - arho1m[i][m] * arho1m[i][m]; + rho3[i] = rho3[i] - 3.0 / 5.0 * (arho3b[i][m] * arho3b[i][m] + - arho3mb[i][m] * arho3mb[i][m]); } - } - - gamma[i] = t_ave[i][0] * rho1[i] + t_ave[i][1] * rho2[i] + t_ave[i][2] * rho3[i]; - - if (rho0[i] > 0.0) { - gamma[i] = gamma[i] / (rho0[i] * rho0[i]); - } - - Z = get_Zij(this->lattce_meam[elti][elti]); - - G = G_gam(gamma[i], this->ibar_meam[elti], errorflag); - if (errorflag != 0) - return; - - get_shpfcn(this->lattce_meam[elti][elti], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shp); - - if (this->ibar_meam[elti] <= 0) { - Gbar = 1.0; - dGbar = 0.0; - } else { - if (this->mix_ref_t == 1) { - gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); - } else { - gam = (this->t1_meam[elti] * 
shp[0] + this->t2_meam[elti] * shp[1] + this->t3_meam[elti] * shp[2]) / - (Z * Z); + for (m = 0; m < 6; m++) { + rho2[i] = rho2[i] + this->v2D[m] * (arho2[i][m] * arho2[i][m] + - arho2m[i][m] * arho2m[i][m]); } - Gbar = G_gam(gam, this->ibar_meam[elti], errorflag); - } - rho[i] = rho0[i] * G; - if (this->mix_ref_t == 1) { + for (m = 0; m < 10; m++) { + rho3[i] = rho3[i] + this->v3D[m] * (arho3[i][m] * arho3[i][m] + - arho3m[i][m] * arho3m[i][m]); + } + + // all the t weights are already accounted for with msmeam + gamma[i] = rho1[i] + rho2[i] + rho3[i]; + + if (rho0[i] > 0.0) { + gamma[i] = gamma[i] / (rho0[i] * rho0[i]); + } + + Z = get_Zij(this->lattce_meam[elti][elti]); + + G = G_gam(gamma[i], this->ibar_meam[elti], errorflag); + if (errorflag != 0) + return; + + get_shpfcn(this->lattce_meam[elti][elti], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shp); + if (this->ibar_meam[elti] <= 0) { Gbar = 1.0; dGbar = 0.0; } else { - gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); - Gbar = dG_gam(gam, this->ibar_meam[elti], dGbar); + if (this->mix_ref_t == 1) { + gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); + } else { + gam = (this->t1_meam[elti] * shp[0] + this->t2_meam[elti] * shp[1] + this->t3_meam[elti] * shp[2]) / + (Z * Z); + } + Gbar = G_gam(gam, this->ibar_meam[elti], errorflag); } - rho_bkgd = this->rho0_meam[elti] * Z * Gbar; - } else { - if (this->bkgd_dyn == 1) { - rho_bkgd = this->rho0_meam[elti] * Z; + rho[i] = rho0[i] * G; + + if (this->mix_ref_t == 1) { + if (this->ibar_meam[elti] <= 0) { + Gbar = 1.0; + dGbar = 0.0; + } else { + gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); + Gbar = dG_gam(gam, this->ibar_meam[elti], dGbar); + } + rho_bkgd = this->rho0_meam[elti] * Z * Gbar; } else { - rho_bkgd = this->rho_ref_meam[elti]; + if (this->bkgd_dyn == 1) { + rho_bkgd = this->rho0_meam[elti] * Z; + } else { + rho_bkgd = 
this->rho_ref_meam[elti]; + } + } + rhob = rho[i] / rho_bkgd; + denom = 1.0 / rho_bkgd; + + G = dG_gam(gamma[i], this->ibar_meam[elti], dG); + + dgamma1[i] = (G - 2 * dG * gamma[i]) * denom; + + if (!iszero(rho0[i])) { + dgamma2[i] = (dG / rho0[i]) * denom; + } else { + dgamma2[i] = 0.0; + } + + // dgamma3 is nonzero only if we are using the "mixed" rule for + // computing t in the reference system (which is not correct, but + // included for backward compatibility + if (this->mix_ref_t == 1) { + dgamma3[i] = rho0[i] * G * dGbar / (Gbar * Z * Z) * denom; + } else { + dgamma3[i] = 0.0; + } + + Fl = embedding(this->A_meam[elti], this->Ec_meam[elti][elti], rhob, frhop[i]); + if (eflag_either != 0) { + Fl *= scaleii; + if (eflag_global != 0) { + *eng_vdwl = *eng_vdwl + Fl; + } + if (eflag_atom != 0) { + eatom[i] = eatom[i] + Fl; + } } } - rhob = rho[i] / rho_bkgd; - denom = 1.0 / rho_bkgd; - - G = dG_gam(gamma[i], this->ibar_meam[elti], dG); - - dgamma1[i] = (G - 2 * dG * gamma[i]) * denom; - - if (!iszero(rho0[i])) { - dgamma2[i] = (dG / rho0[i]) * denom; - } else { - dgamma2[i] = 0.0; - } - - // dgamma3 is nonzero only if we are using the "mixed" rule for - // computing t in the reference system (which is not correct, but - // included for backward compatibility - if (this->mix_ref_t == 1) { - dgamma3[i] = rho0[i] * G * dGbar / (Gbar * Z * Z) * denom; - } else { - dgamma3[i] = 0.0; - } - - Fl = embedding(this->A_meam[elti], this->Ec_meam[elti][elti], rhob, frhop[i]); - - if (eflag_either != 0) { - Fl *= scaleii; - if (eflag_global != 0) { - *eng_vdwl = *eng_vdwl + Fl; + } + } else { + for (i = 0; i < nlocal; i++) { + elti = fmap[type[i]]; + if (elti >= 0) { + scaleii = scale[type[i]][type[i]]; + rho1[i] = 0.0; + rho2[i] = -1.0 / 3.0 * arho2b[i] * arho2b[i]; + rho3[i] = 0.0; + for (m = 0; m < 3; m++) { + rho1[i] = rho1[i] + arho1[i][m] * arho1[i][m]; + rho3[i] = rho3[i] - 3.0 / 5.0 * arho3b[i][m] * arho3b[i][m]; } - if (eflag_atom != 0) { - eatom[i] = eatom[i] + Fl; + 
for (m = 0; m < 6; m++) { + rho2[i] = rho2[i] + this->v2D[m] * arho2[i][m] * arho2[i][m]; + } + for (m = 0; m < 10; m++) { + rho3[i] = rho3[i] + this->v3D[m] * arho3[i][m] * arho3[i][m]; + } + + if (rho0[i] > 0.0) { + if (this->ialloy == 1) { + t_ave[i][0] = fdiv_zero(t_ave[i][0], tsq_ave[i][0]); + t_ave[i][1] = fdiv_zero(t_ave[i][1], tsq_ave[i][1]); + t_ave[i][2] = fdiv_zero(t_ave[i][2], tsq_ave[i][2]); + } else if (this->ialloy == 2) { + t_ave[i][0] = this->t1_meam[elti]; + t_ave[i][1] = this->t2_meam[elti]; + t_ave[i][2] = this->t3_meam[elti]; + } else { + t_ave[i][0] = t_ave[i][0] / rho0[i]; + t_ave[i][1] = t_ave[i][1] / rho0[i]; + t_ave[i][2] = t_ave[i][2] / rho0[i]; + } + } + + gamma[i] = t_ave[i][0] * rho1[i] + t_ave[i][1] * rho2[i] + t_ave[i][2] * rho3[i]; + + if (rho0[i] > 0.0) { + gamma[i] = gamma[i] / (rho0[i] * rho0[i]); + } + + Z = get_Zij(this->lattce_meam[elti][elti]); + + G = G_gam(gamma[i], this->ibar_meam[elti], errorflag); + if (errorflag != 0) + return; + + get_shpfcn(this->lattce_meam[elti][elti], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shp); + + if (this->ibar_meam[elti] <= 0) { + Gbar = 1.0; + dGbar = 0.0; + } else { + if (this->mix_ref_t == 1) { + gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); + } else { + gam = (this->t1_meam[elti] * shp[0] + this->t2_meam[elti] * shp[1] + this->t3_meam[elti] * shp[2]) / + (Z * Z); + } + Gbar = G_gam(gam, this->ibar_meam[elti], errorflag); + } + rho[i] = rho0[i] * G; + + if (this->mix_ref_t == 1) { + if (this->ibar_meam[elti] <= 0) { + Gbar = 1.0; + dGbar = 0.0; + } else { + gam = (t_ave[i][0] * shp[0] + t_ave[i][1] * shp[1] + t_ave[i][2] * shp[2]) / (Z * Z); + Gbar = dG_gam(gam, this->ibar_meam[elti], dGbar); + } + rho_bkgd = this->rho0_meam[elti] * Z * Gbar; + } else { + if (this->bkgd_dyn == 1) { + rho_bkgd = this->rho0_meam[elti] * Z; + } else { + rho_bkgd = this->rho_ref_meam[elti]; + } + } + rhob = rho[i] / rho_bkgd; + denom = 1.0 / 
rho_bkgd; + + G = dG_gam(gamma[i], this->ibar_meam[elti], dG); + + dgamma1[i] = (G - 2 * dG * gamma[i]) * denom; + + if (!iszero(rho0[i])) { + dgamma2[i] = (dG / rho0[i]) * denom; + } else { + dgamma2[i] = 0.0; + } + + // dgamma3 is nonzero only if we are using the "mixed" rule for + // computing t in the reference system (which is not correct, but + // included for backward compatibility + if (this->mix_ref_t == 1) { + dgamma3[i] = rho0[i] * G * dGbar / (Gbar * Z * Z) * denom; + } else { + dgamma3[i] = 0.0; + } + + Fl = embedding(this->A_meam[elti], this->Ec_meam[elti][elti], rhob, frhop[i]); + + if (eflag_either != 0) { + Fl *= scaleii; + if (eflag_global != 0) { + *eng_vdwl = *eng_vdwl + Fl; + } + if (eflag_atom != 0) { + eatom[i] = eatom[i] + Fl; + + } } } } diff --git a/src/MEAM/meam_dens_init.cpp b/src/MEAM/meam_dens_init.cpp index b60e1a7a17..00ad276ad7 100644 --- a/src/MEAM/meam_dens_init.cpp +++ b/src/MEAM/meam_dens_init.cpp @@ -45,6 +45,14 @@ MEAM::meam_dens_setup(int atom_nmax, int nall, int n_neigh) memory->destroy(arho3b); memory->destroy(t_ave); memory->destroy(tsq_ave); + // msmeam params + if (this->msmeamflag) { + memory->destroy(arho1m); + memory->destroy(arho2m); + memory->destroy(arho3m); + memory->destroy(arho2mb); + memory->destroy(arho3mb); + } nmax = atom_nmax; @@ -65,6 +73,14 @@ MEAM::meam_dens_setup(int atom_nmax, int nall, int n_neigh) memory->create(arho3b, nmax, 3, "pair:arho3b"); memory->create(t_ave, nmax, 3, "pair:t_ave"); memory->create(tsq_ave, nmax, 3, "pair:tsq_ave"); + // msmeam params + if (this->msmeamflag) { + memory->create(arho1m, nmax, 3, "pair:arho1m"); + memory->create(arho2m, nmax, 6, "pair:arho2m"); + memory->create(arho3m, nmax, 10, "pair:arho3m"); + memory->create(arho2mb, nmax, "pair:arho2mb"); + memory->create(arho3mb, nmax, 3, "pair:arho3mb"); + } } if (n_neigh > maxneigh) { @@ -83,14 +99,30 @@ MEAM::meam_dens_setup(int atom_nmax, int nall, int n_neigh) rho0[i] = 0.0; arho2b[i] = 0.0; arho1[i][0] = arho1[i][1] = 
arho1[i][2] = 0.0; - for (j = 0; j < 6; j++) + if (this->msmeamflag) { + arho2mb[i] = 0.0; + arho1m[i][0] = arho1m[i][1] = arho1m[i][2] = 0.0; + } + for (j = 0; j < 6; j++) { arho2[i][j] = 0.0; - for (j = 0; j < 10; j++) + if (this->msmeamflag) { + arho2m[i][j] = 0.0; + } + } + for (j = 0; j < 10; j++) { arho3[i][j] = 0.0; + if (this->msmeamflag) { + arho3m[i][j] = 0.0; + } + } arho3b[i][0] = arho3b[i][1] = arho3b[i][2] = 0.0; + if (this->msmeamflag) { + arho3mb[i][0] = arho3mb[i][1] = arho3mb[i][2] = 0.0; + } t_ave[i][0] = t_ave[i][1] = t_ave[i][2] = 0.0; tsq_ave[i][0] = tsq_ave[i][1] = tsq_ave[i][2] = 0.0; } + } void @@ -282,6 +314,9 @@ MEAM::calc_rho1(int i, int /*ntype*/, int* type, int* fmap, double** x, int numn // double G,Gbar,gam,shp[3+1]; double ro0i, ro0j; double rhoa0i, rhoa1i, rhoa2i, rhoa3i, A1i, A2i, A3i; + // msmeam params + double rhoa1mj, rhoa2mj, rhoa3mj, A1mj, A2mj, A3mj; + double rhoa1mi, rhoa2mi, rhoa3mi, A1mi, A2mi, A3mi; elti = fmap[type[i]]; xtmp = x[i][0]; @@ -306,10 +341,20 @@ MEAM::calc_rho1(int i, int /*ntype*/, int* type, int* fmap, double** x, int numn rhoa1j = ro0j * MathSpecial::fm_exp(-this->beta1_meam[eltj] * aj) * sij; rhoa2j = ro0j * MathSpecial::fm_exp(-this->beta2_meam[eltj] * aj) * sij; rhoa3j = ro0j * MathSpecial::fm_exp(-this->beta3_meam[eltj] * aj) * sij; + if (this->msmeamflag){ + rhoa1mj = ro0j * this->t1m_meam[eltj] * MathSpecial::fm_exp(-this->beta1m_meam[eltj] * aj) * sij; + rhoa2mj = ro0j * this->t2m_meam[eltj] * MathSpecial::fm_exp(-this->beta2m_meam[eltj] * aj) * sij; + rhoa3mj = ro0j * this->t3m_meam[eltj] * MathSpecial::fm_exp(-this->beta3m_meam[eltj] * aj) * sij; + } rhoa0i = ro0i * MathSpecial::fm_exp(-this->beta0_meam[elti] * ai) * sij; rhoa1i = ro0i * MathSpecial::fm_exp(-this->beta1_meam[elti] * ai) * sij; rhoa2i = ro0i * MathSpecial::fm_exp(-this->beta2_meam[elti] * ai) * sij; rhoa3i = ro0i * MathSpecial::fm_exp(-this->beta3_meam[elti] * ai) * sij; + if (this->msmeamflag){ + rhoa1mi = ro0i * 
this->t1m_meam[elti] * MathSpecial::fm_exp(-this->beta1m_meam[elti] * ai) * sij; + rhoa2mi = ro0i * this->t2m_meam[elti] * MathSpecial::fm_exp(-this->beta2m_meam[elti] * ai) * sij; + rhoa3mi = ro0i * this->t3m_meam[elti] * MathSpecial::fm_exp(-this->beta3m_meam[elti] * ai) * sij; + } if (this->ialloy == 1) { rhoa1j = rhoa1j * this->t1_meam[eltj]; rhoa2j = rhoa2j * this->t2_meam[eltj]; @@ -321,6 +366,7 @@ MEAM::calc_rho1(int i, int /*ntype*/, int* type, int* fmap, double** x, int numn rho0[i] = rho0[i] + rhoa0j; rho0[j] = rho0[j] + rhoa0i; // For ialloy = 2, use single-element value (not average) + // For ialloy = 2, use single-element value (not average) if (this->ialloy != 2) { t_ave[i][0] = t_ave[i][0] + this->t1_meam[eltj] * rhoa0j; t_ave[i][1] = t_ave[i][1] + this->t2_meam[eltj] * rhoa0j; @@ -348,18 +394,42 @@ MEAM::calc_rho1(int i, int /*ntype*/, int* type, int* fmap, double** x, int numn A3i = rhoa3i / (rij2 * rij); nv2 = 0; nv3 = 0; + if (this->msmeamflag) { + arho2mb[i] = arho2mb[i] + rhoa2mj; + arho2mb[j] = arho2mb[j] + rhoa2mi; + A1mj = rhoa1mj/rij; + A2mj = rhoa2mj/rij2; + A3mj = rhoa3mj/(rij2*rij); + A1mi = rhoa1mi/rij; + A2mi = rhoa2mi/rij2; + A3mi = rhoa3mi/(rij2*rij); + } for (m = 0; m < 3; m++) { arho1[i][m] = arho1[i][m] + A1j * delij[m]; arho1[j][m] = arho1[j][m] - A1i * delij[m]; arho3b[i][m] = arho3b[i][m] + rhoa3j * delij[m] / rij; arho3b[j][m] = arho3b[j][m] - rhoa3i * delij[m] / rij; + if (this->msmeamflag) { + arho1m[i][m] = arho1m[i][m] + A1mj*delij[m]; + arho1m[j][m] = arho1m[j][m] - A1mi*delij[m]; + arho3mb[i][m] = arho3mb[i][m] + rhoa3mj*delij[m] / rij; + arho3mb[j][m] = arho3mb[j][m] - rhoa3mi*delij[m] / rij; + } for (n = m; n < 3; n++) { arho2[i][nv2] = arho2[i][nv2] + A2j * delij[m] * delij[n]; arho2[j][nv2] = arho2[j][nv2] + A2i * delij[m] * delij[n]; + if (this->msmeamflag) { + arho2m[i][nv2] = arho2m[i][nv2] + A2mj*delij[m] * delij[n]; + arho2m[j][nv2] = arho2m[j][nv2] + A2mi*delij[m] * delij[n]; + } nv2 = nv2 + 1; for (p = n; p < 
3; p++) { arho3[i][nv3] = arho3[i][nv3] + A3j * delij[m] * delij[n] * delij[p]; arho3[j][nv3] = arho3[j][nv3] - A3i * delij[m] * delij[n] * delij[p]; + if (this->msmeamflag) { + arho3m[i][nv3] = arho3m[i][nv3] + A3mj*delij[m]*delij[n]*delij[p]; + arho3m[j][nv3] = arho3m[j][nv3] - A3mi*delij[m]*delij[n]*delij[p]; + } nv3 = nv3 + 1; } } diff --git a/src/MEAM/meam_force.cpp b/src/MEAM/meam_force.cpp index acc3d5672a..4bc7380898 100644 --- a/src/MEAM/meam_force.cpp +++ b/src/MEAM/meam_force.cpp @@ -61,6 +61,17 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int double t1i, t2i, t3i, t1j, t2j, t3j; double scaleij; + double rhoa1mj,drhoa1mj,rhoa1mi,drhoa1mi; + double rhoa2mj,drhoa2mj,rhoa2mi,drhoa2mi; + double rhoa3mj, drhoa3mj, rhoa3mi, drhoa3mi; + double arg1i1m, arg1j1m, arg1i2m, arg1j2m, arg1i3m, arg1j3m, arg3i3m, arg3j3m; + double drho1mdr1, drho1mdr2, drho1mds1, drho1mds2; + double drho1mdrm1[3], drho1mdrm2[3]; + double drho2mdr1, drho2mdr2, drho2mds1, drho2mds2; + double drho2mdrm1[3], drho2mdrm2[3]; + double drho3mdr1, drho3mdr2, drho3mds1, drho3mds2; + double drho3mdrm1[3], drho3mdrm2[3]; + third = 1.0 / 3.0; sixth = 1.0 / 6.0; @@ -74,6 +85,7 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int zitmp = x[i][2]; // Treat each pair + for (jn = 0; jn < numneigh; jn++) { j = firstneigh[jn]; eltj = fmap[type[j]]; @@ -89,7 +101,6 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int if (rij2 < this->cutforcesq) { rij = sqrt(rij2); recip = 1.0 / rij; - // Compute phi and phip ind = this->eltind[elti][eltj]; pp = rij * this->rdrar; @@ -114,6 +125,7 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int // write(1,*) "force_meamf: phip: ",phip // Compute pair densities and derivatives + invrei = 1.0 / this->re_meam[elti][elti]; ai = rij * invrei - 1.0; ro0i = this->rho0_meam[elti]; @@ -126,6 +138,15 @@ MEAM::meam_force(int i, int eflag_global, int 
eflag_atom, int vflag_global, int rhoa3i = ro0i * MathSpecial::fm_exp(-this->beta3_meam[elti] * ai); drhoa3i = -this->beta3_meam[elti] * invrei * rhoa3i; + if (this->msmeamflag) { + rhoa1mi = ro0i * MathSpecial::fm_exp(-this->beta1m_meam[elti] * ai) * t1m_meam[elti]; + drhoa1mi = -this->beta1m_meam[elti] * invrei * rhoa1mi; + rhoa2mi = ro0i * MathSpecial::fm_exp(-this->beta2m_meam[elti] * ai) * t2m_meam[elti]; + drhoa2mi = -this->beta2m_meam[elti] * invrei * rhoa2mi; + rhoa3mi = ro0i * MathSpecial::fm_exp(-this->beta3m_meam[elti] * ai) * t3m_meam[elti]; + drhoa3mi = -this->beta3m_meam[elti] * invrei * rhoa3mi; + } + if (elti != eltj) { invrej = 1.0 / this->re_meam[eltj][eltj]; aj = rij * invrej - 1.0; @@ -138,6 +159,16 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int drhoa2j = -this->beta2_meam[eltj] * invrej * rhoa2j; rhoa3j = ro0j * MathSpecial::fm_exp(-this->beta3_meam[eltj] * aj); drhoa3j = -this->beta3_meam[eltj] * invrej * rhoa3j; + + if (this->msmeamflag) { + rhoa1mj = ro0j * t1m_meam[eltj] * MathSpecial::fm_exp(-this->beta1m_meam[eltj] * aj); + drhoa1mj = -this->beta1m_meam[eltj] * invrej * rhoa1mj; + rhoa2mj = ro0j * t2m_meam[eltj] * MathSpecial::fm_exp(-this->beta2m_meam[eltj] * aj); + drhoa2mj = -this->beta2m_meam[eltj] * invrej * rhoa2mj; + rhoa3mj = ro0j * t3m_meam[eltj] * MathSpecial::fm_exp(-this->beta3m_meam[eltj] * aj); + drhoa3mj = -this->beta3m_meam[eltj] * invrej * rhoa3mj; + } + } else { rhoa0j = rhoa0i; drhoa0j = drhoa0i; @@ -147,6 +178,15 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int drhoa2j = drhoa2i; rhoa3j = rhoa3i; drhoa3j = drhoa3i; + + if (this->msmeamflag) { + rhoa1mj = rhoa1mi; + drhoa1mj = drhoa1mi; + rhoa2mj = rhoa2mi; + drhoa2mj = drhoa2mi; + rhoa3mj = rhoa3mi; + drhoa3mj = drhoa3mi; + } } const double t1mi = this->t1_meam[elti]; @@ -156,7 +196,10 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int const double t2mj = 
this->t2_meam[eltj]; const double t3mj = this->t3_meam[eltj]; - if (this->ialloy == 1) { + // ialloy mod not needed in MS-MEAM, but similarity here is that we multiply rhos by t. + // We did this above with rhoa1mj, rhoa2mj, etc. + + if (this->ialloy == 1 || this->msmeamflag) { rhoa1j *= t1mj; rhoa2j *= t2mj; rhoa3j *= t3mj; @@ -200,6 +243,39 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int arg3j3 = arg3j3 - arho3b[j][n] * delij[n]; } + + // msmeam arhom args + + nv2 = 0; + nv3 = 0; + arg1i1m = 0.0; + arg1j1m = 0.0; + arg1i2m = 0.0; + arg1j2m = 0.0; + arg1i3m = 0.0; + arg1j3m = 0.0; + arg3i3m = 0.0; + arg3j3m = 0.0; + if (this->msmeamflag) { + for (n = 0; n < 3; n++) { + for (p = n; p < 3; p++) { + for (q = p; q < 3; q++) { + arg = delij[n] * delij[p] * delij[q] * this->v3D[nv3]; + arg1i3m = arg1i3m - arho3m[i][nv3] * arg; + arg1j3m = arg1j3m + arho3m[j][nv3] * arg; + nv3 = nv3 + 1; + } + arg = delij[n] * delij[p] * this->v2D[nv2]; + arg1i2m = arg1i2m + arho2m[i][nv2] * arg; + arg1j2m = arg1j2m + arho2m[j][nv2] * arg; + nv2 = nv2 + 1; + } + arg1i1m = arg1i1m - arho1m[i][n] * delij[n]; + arg1j1m = arg1j1m + arho1m[j][n] * delij[n]; + arg3i3m = arg3i3m - arho3mb[i][n] * delij[n]; + arg3j3m = arg3j3m + arho3mb[j][n] * delij[n]; + } + } + // rho0 terms drho0dr1 = drhoa0j * sij; drho0dr2 = drhoa0i * sij; @@ -254,32 +330,83 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int drho3drm2[m] = (-a3 * drho3drm2[m] + a3a * arho3b[j][m]) * rhoa3i; } - // Compute derivatives of weighting functions t wrt rij - t1i = t_ave[i][0]; - t2i = t_ave[i][1]; - t3i = t_ave[i][2]; - t1j = t_ave[j][0]; - t2j = t_ave[j][1]; - t3j = t_ave[j][2]; + if (this->msmeamflag) { + // rho1m terms + a1 = 2 * sij / rij; + drho1mdr1 = a1 * (drhoa1mj - rhoa1mj / rij) * arg1i1m; + drho1mdr2 = a1 * (drhoa1mi - rhoa1mi / rij) * arg1j1m; + drho1mdr1 *= -1.0; + drho1mdr2 *= -1.0; + a1 = 2.0 * sij / rij; + for (m = 0; m < 3; m++) { + drho1mdrm1[m] = 
a1 * rhoa1mj * arho1m[i][m]; + drho1mdrm2[m] = -a1 * rhoa1mi * arho1m[j][m]; + } - if (this->ialloy == 1) { + // rho2m terms + a2 = 2 * sij / rij2; + drho2mdr1 = a2 * (drhoa2mj - 2 * rhoa2mj / rij) * arg1i2m - 2.0 / 3.0 * arho2mb[i] * drhoa2mj * sij; + drho2mdr2 = a2 * (drhoa2mi - 2 * rhoa2mi / rij) * arg1j2m - 2.0 / 3.0 * arho2mb[j] * drhoa2mi * sij; + a2 = 4 * sij / rij2; + for (m = 0; m < 3; m++) { + drho2mdrm1[m] = 0.0; + drho2mdrm2[m] = 0.0; + for (n = 0; n < 3; n++) { + drho2mdrm1[m] += arho2m[i][this->vind2D[m][n]] * delij[n]; + drho2mdrm2[m] -= arho2m[j][this->vind2D[m][n]] * delij[n]; + } + drho2mdrm1[m] = a2 * rhoa2mj * drho2mdrm1[m]; + drho2mdrm2[m] = -a2 * rhoa2mi * drho2mdrm2[m]; + } - a1i = fdiv_zero(drhoa0j * sij, tsq_ave[i][0]); - a1j = fdiv_zero(drhoa0i * sij, tsq_ave[j][0]); - a2i = fdiv_zero(drhoa0j * sij, tsq_ave[i][1]); - a2j = fdiv_zero(drhoa0i * sij, tsq_ave[j][1]); - a3i = fdiv_zero(drhoa0j * sij, tsq_ave[i][2]); - a3j = fdiv_zero(drhoa0i * sij, tsq_ave[j][2]); + // rho3m terms + rij3 = rij * rij2; + a3 = 2 * sij / rij3; + a3a = 6.0 / 5.0 * sij / rij; + drho3mdr1 = a3 * (drhoa3mj - 3 * rhoa3mj / rij) * arg1i3m - a3a * (drhoa3mj - rhoa3mj / rij) * arg3i3m; + drho3mdr2 = a3 * (drhoa3mi - 3 * rhoa3mi / rij) * arg1j3m - a3a * (drhoa3mi - rhoa3mi / rij) * arg3j3m; + drho3mdr1 *= -1.0; + drho3mdr2 *= -1.0; - dt1dr1 = a1i * (t1mj - t1i * MathSpecial::square(t1mj)); - dt1dr2 = a1j * (t1mi - t1j * MathSpecial::square(t1mi)); - dt2dr1 = a2i * (t2mj - t2i * MathSpecial::square(t2mj)); - dt2dr2 = a2j * (t2mi - t2j * MathSpecial::square(t2mi)); - dt3dr1 = a3i * (t3mj - t3i * MathSpecial::square(t3mj)); - dt3dr2 = a3j * (t3mi - t3j * MathSpecial::square(t3mi)); + a3 = 6 * sij / rij3; + a3a = 6 * sij / (5 * rij); + for (m = 0; m < 3; m++) { + drho3mdrm1[m] = 0.0; + drho3mdrm2[m] = 0.0; + nv2 = 0; + for (n = 0; n < 3; n++) { + for (p = n; p < 3; p++) { + arg = delij[n] * delij[p] * this->v2D[nv2]; + drho3mdrm1[m] += arho3m[i][this->vind3D[m][n][p]] * arg; + 
drho3mdrm2[m] += arho3m[j][this->vind3D[m][n][p]] * arg; + nv2 = nv2 + 1; + } + } + drho3mdrm1[m] = (a3 * drho3mdrm1[m] - a3a * arho3mb[i][m]) * rhoa3mj; + drho3mdrm2[m] = (-a3 * drho3mdrm2[m] + a3a * arho3mb[j][m]) * rhoa3mi; + } + } else { + for (m = 0; m < 3; m++) { + drho1mdrm1[m] = 0.0; + drho1mdrm2[m] = 0.0; + drho2mdrm1[m] = 0.0; + drho2mdrm2[m] = 0.0; + drho3mdrm1[m] = 0.0; + drho3mdrm2[m] = 0.0; + } + } - } else if (this->ialloy == 2) { + // compute derivatives of weighting functions t wrt rij + // weighting functions t set to unity for MS-MEAM + if (this->msmeamflag) { + + t1i = 1.0; + t2i = 1.0; + t3i = 1.0; + t1j = 1.0; + t2j = 1.0; + t3j = 1.0; dt1dr1 = 0.0; dt1dr2 = 0.0; dt2dr1 = 0.0; @@ -289,38 +416,98 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int } else { - ai = 0.0; - if (!iszero(rho0[i])) - ai = drhoa0j * sij / rho0[i]; - aj = 0.0; - if (!iszero(rho0[j])) - aj = drhoa0i * sij / rho0[j]; + t1i = t_ave[i][0]; + t2i = t_ave[i][1]; + t3i = t_ave[i][2]; + t1j = t_ave[j][0]; + t2j = t_ave[j][1]; + t3j = t_ave[j][2]; + + if (this->ialloy == 1) { + + a1i = fdiv_zero(drhoa0j * sij, tsq_ave[i][0]); + a1j = fdiv_zero(drhoa0i * sij, tsq_ave[j][0]); + a2i = fdiv_zero(drhoa0j * sij, tsq_ave[i][1]); + a2j = fdiv_zero(drhoa0i * sij, tsq_ave[j][1]); + a3i = fdiv_zero(drhoa0j * sij, tsq_ave[i][2]); + a3j = fdiv_zero(drhoa0i * sij, tsq_ave[j][2]); + + dt1dr1 = a1i * (t1mj - t1i * MathSpecial::square(t1mj)); + dt1dr2 = a1j * (t1mi - t1j * MathSpecial::square(t1mi)); + dt2dr1 = a2i * (t2mj - t2i * MathSpecial::square(t2mj)); + dt2dr2 = a2j * (t2mi - t2j * MathSpecial::square(t2mi)); + dt3dr1 = a3i * (t3mj - t3i * MathSpecial::square(t3mj)); + dt3dr2 = a3j * (t3mi - t3j * MathSpecial::square(t3mi)); + + } else if (this->ialloy == 2) { + + dt1dr1 = 0.0; + dt1dr2 = 0.0; + dt2dr1 = 0.0; + dt2dr2 = 0.0; + dt3dr1 = 0.0; + dt3dr2 = 0.0; + + } else { + + ai = 0.0; + if (!iszero(rho0[i])) + ai = drhoa0j * sij / rho0[i]; + aj = 0.0; + if 
(!iszero(rho0[j])) + aj = drhoa0i * sij / rho0[j]; + + dt1dr1 = ai * (t1mj - t1i); + dt1dr2 = aj * (t1mi - t1j); + dt2dr1 = ai * (t2mj - t2i); + dt2dr2 = aj * (t2mi - t2j); + dt3dr1 = ai * (t3mj - t3i); + dt3dr2 = aj * (t3mi - t3j); + } - dt1dr1 = ai * (t1mj - t1i); - dt1dr2 = aj * (t1mi - t1j); - dt2dr1 = ai * (t2mj - t2i); - dt2dr2 = aj * (t2mi - t2j); - dt3dr1 = ai * (t3mj - t3i); - dt3dr2 = aj * (t3mi - t3j); } // Compute derivatives of total density wrt rij, sij and rij(3) get_shpfcn(this->lattce_meam[elti][elti], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shpi); get_shpfcn(this->lattce_meam[eltj][eltj], this->stheta_meam[elti][elti], this->ctheta_meam[elti][elti], shpj); - drhodr1 = dgamma1[i] * drho0dr1 + - dgamma2[i] * (dt1dr1 * rho1[i] + t1i * drho1dr1 + dt2dr1 * rho2[i] + t2i * drho2dr1 + - dt3dr1 * rho3[i] + t3i * drho3dr1) - - dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); - drhodr2 = dgamma1[j] * drho0dr2 + - dgamma2[j] * (dt1dr2 * rho1[j] + t1j * drho1dr2 + dt2dr2 * rho2[j] + t2j * drho2dr2 + - dt3dr2 * rho3[j] + t3j * drho3dr2) - - dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); - for (m = 0; m < 3; m++) { - drhodrm1[m] = 0.0; - drhodrm2[m] = 0.0; - drhodrm1[m] = dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); - drhodrm2[m] = dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + if (this->msmeamflag) { + drhodr1 = dgamma1[i] * drho0dr1 + + dgamma2[i] * (dt1dr1 * rho1[i] + t1i * (drho1dr1 - drho1mdr1) + + dt2dr1 * rho2[i] + t2i * (drho2dr1 - drho2mdr1) + + dt3dr1 * rho3[i] + t3i * (drho3dr1 - drho3mdr1)) - + dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); + drhodr2 = dgamma1[j] * drho0dr2 + + dgamma2[j] * (dt1dr2 * rho1[j] + t1j * (drho1dr2 - drho1mdr2) + + dt2dr2 * rho2[j] + t2j * (drho2dr2 - drho2mdr2) + + dt3dr2 * rho3[j] + t3j * (drho3dr2 - drho3mdr2)) - + dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + 
shpj[2] * dt3dr2); + for (m = 0; m < 3; m++) { + drhodrm1[m] = 0.0; + drhodrm2[m] = 0.0; + drhodrm1[m] = dgamma2[i] * (t1i * (drho1drm1[m] - drho1mdrm1[m]) + + t2i * (drho2drm1[m] - drho2mdrm1[m]) + + t3i * (drho3drm1[m] - drho3mdrm1[m]) ); + drhodrm2[m] = dgamma2[j] * (t1j * (drho1drm2[m] - drho1mdrm2[m]) + + t2j * (drho2drm2[m] - drho2mdrm2[m]) + + t3j * (drho3drm2[m] - drho3mdrm2[m]) ); + } + } else { + + drhodr1 = dgamma1[i] * drho0dr1 + + dgamma2[i] * (dt1dr1 * rho1[i] + t1i * drho1dr1 + dt2dr1 * rho2[i] + t2i * drho2dr1 + + dt3dr1 * rho3[i] + t3i * drho3dr1) - + dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); + drhodr2 = dgamma1[j] * drho0dr2 + + dgamma2[j] * (dt1dr2 * rho1[j] + t1j * drho1dr2 + dt2dr2 * rho2[j] + t2j * drho2dr2 + + dt3dr2 * rho3[j] + t3j * drho3dr2) - + dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); + for (m = 0; m < 3; m++) { + drhodrm1[m] = 0.0; + drhodrm2[m] = 0.0; + drhodrm1[m] = dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); + drhodrm2[m] = dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + } } // Compute derivatives wrt sij, but only if necessary @@ -328,17 +515,37 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int drho0ds1 = rhoa0j; drho0ds2 = rhoa0i; a1 = 2.0 / rij; - drho1ds1 = a1 * rhoa1j * arg1i1; - drho1ds2 = a1 * rhoa1i * arg1j1; a2 = 2.0 / rij2; - drho2ds1 = a2 * rhoa2j * arg1i2 - 2.0 / 3.0 * arho2b[i] * rhoa2j; - drho2ds2 = a2 * rhoa2i * arg1j2 - 2.0 / 3.0 * arho2b[j] * rhoa2i; a3 = 2.0 / rij3; a3a = 6.0 / (5.0 * rij); + + drho1ds1 = a1 * rhoa1j * arg1i1; + drho1ds2 = a1 * rhoa1i * arg1j1; + drho2ds1 = a2 * rhoa2j * arg1i2 - 2.0 / 3.0 * arho2b[i] * rhoa2j; + drho2ds2 = a2 * rhoa2i * arg1j2 - 2.0 / 3.0 * arho2b[j] * rhoa2i; drho3ds1 = a3 * rhoa3j * arg1i3 - a3a * rhoa3j * arg3i3; drho3ds2 = a3 * rhoa3i * arg1j3 - a3a * rhoa3i * arg3j3; + if (this->msmeamflag) { + drho1mds1 = a1 * rhoa1mj * 
arg1i1m; + drho1mds2 = a1 * rhoa1mi * arg1j1m; + drho2mds1 = a2 * rhoa2mj * arg1i2m - 2.0 / 3.0 * arho2mb[i] * rhoa2mj; + drho2mds2 = a2 * rhoa2mi * arg1j2m - 2.0 / 3.0 * arho2mb[j] * rhoa2mi; + drho3mds1 = a3 * rhoa3mj * arg1i3m - a3a * rhoa3mj * arg3i3m; + drho3mds2 = a3 * rhoa3mi * arg1j3m - a3a * rhoa3mi * arg3j3m; + drho3mds1 *= -1; + drho3mds2 *= -1; + } else { + drho1mds1 = 0.0; + drho1mds2 = 0.0; + drho2mds1 = 0.0; + drho2mds2 = 0.0; + drho3mds1 = 0.0; + drho3mds2 = 0.0; + } + if (this->ialloy == 1) { + a1i = fdiv_zero(rhoa0j, tsq_ave[i][0]); a1j = fdiv_zero(rhoa0i, tsq_ave[j][0]); a2i = fdiv_zero(rhoa0j, tsq_ave[i][1]); @@ -379,19 +586,36 @@ MEAM::meam_force(int i, int eflag_global, int eflag_atom, int vflag_global, int dt3ds2 = aj * (t3mi - t3j); } - drhods1 = dgamma1[i] * drho0ds1 + - dgamma2[i] * (dt1ds1 * rho1[i] + t1i * drho1ds1 + dt2ds1 * rho2[i] + t2i * drho2ds1 + - dt3ds1 * rho3[i] + t3i * drho3ds1) - - dgamma3[i] * (shpi[0] * dt1ds1 + shpi[1] * dt2ds1 + shpi[2] * dt3ds1); - drhods2 = dgamma1[j] * drho0ds2 + - dgamma2[j] * (dt1ds2 * rho1[j] + t1j * drho1ds2 + dt2ds2 * rho2[j] + t2j * drho2ds2 + - dt3ds2 * rho3[j] + t3j * drho3ds2) - - dgamma3[j] * (shpj[0] * dt1ds2 + shpj[1] * dt2ds2 + shpj[2] * dt3ds2); + if (this->msmeamflag) { + drhods1 = dgamma1[i] * drho0ds1 + + dgamma2[i] * (dt1ds1 * rho1[i] + t1i * (drho1ds1 - drho1mds1) + + dt2ds1 * rho2[i] + t2i * (drho2ds1 - drho2mds1) + + dt3ds1 * rho3[i] + t3i * (drho3ds1 - drho3mds1)) - + dgamma3[i] * (shpi[0] * dt1ds1 + shpi[1] * dt2ds1 + shpi[2] * dt3ds1); + drhods2 = dgamma1[j] * drho0ds2 + + dgamma2[j] * (dt1ds2 * rho1[j] + t1j * (drho1ds2 - drho1mds2) + + dt2ds2 * rho2[j] + t2j * (drho2ds2 - drho2mds2) + + dt3ds2 * rho3[j] + t3j * (drho3ds2 - drho3mds2)) - + dgamma3[j] * (shpj[0] * dt1ds2 + shpj[1] * dt2ds2 + shpj[2] * dt3ds2); + } + else { + drhods1 = dgamma1[i] * drho0ds1 + + dgamma2[i] * (dt1ds1 * rho1[i] + t1i * drho1ds1 + dt2ds1 * rho2[i] + t2i * drho2ds1 + + dt3ds1 * rho3[i] + t3i * 
drho3ds1) - + dgamma3[i] * (shpi[0] * dt1ds1 + shpi[1] * dt2ds1 + shpi[2] * dt3ds1); + drhods2 = dgamma1[j] * drho0ds2 + + dgamma2[j] * (dt1ds2 * rho1[j] + t1j * drho1ds2 + dt2ds2 * rho2[j] + t2j * drho2ds2 + + dt3ds2 * rho3[j] + t3j * drho3ds2) - + dgamma3[j] * (shpj[0] * dt1ds2 + shpj[1] * dt2ds2 + shpj[2] * dt3ds2); + } } - // Compute derivatives of energy wrt rij, sij and rij[3] + // Compute derivatives of energy wrt rij, sij and rij[3] + // MS-MEAM affects phip + dUdrij = phip * sij + frhop[i] * drhodr1 + frhop[j] * drhodr2; dUdsij = 0.0; + if (!iszero(dscrfcn[fnoffset + jn])) { dUdsij = phi + frhop[i] * drhods1 + frhop[j] * drhods2; } diff --git a/src/MEAM/meam_impl.cpp b/src/MEAM/meam_impl.cpp index bbfb83e94a..5290647b18 100644 --- a/src/MEAM/meam_impl.cpp +++ b/src/MEAM/meam_impl.cpp @@ -34,6 +34,11 @@ MEAM::MEAM(Memory* mem) gamma = dgamma1 = dgamma2 = dgamma3 = arho2b = nullptr; arho1 = arho2 = arho3 = arho3b = t_ave = tsq_ave = nullptr; + // msmeam arrays + msmeamflag = 0; + arho2mb = nullptr; + arho1m = arho2m = arho3m = arho3mb = nullptr; + maxneigh = 0; scrfcn = dscrfcn = fcpair = nullptr; copymode = 0; @@ -43,7 +48,9 @@ MEAM::MEAM(Memory* mem) A_meam[i] = rho0_meam[i] = beta0_meam[i] = beta1_meam[i]= beta2_meam[i] = beta3_meam[i] = t0_meam[i] = t1_meam[i] = t2_meam[i] = t3_meam[i] = - rho_ref_meam[i] = ibar_meam[i] = ielt_meam[i] = 0.0; + rho_ref_meam[i] = ibar_meam[i] = ielt_meam[i] = + t1m_meam[i] = t2m_meam[i] = t3m_meam[i] = + beta1m_meam[i] = beta2m_meam[i] = beta3m_meam[i] = 0.0; for (int j = 0; j < maxelt; j++) { lattce_meam[i][j] = FCC; Ec_meam[i][j] = re_meam[i][j] = alpha_meam[i][j] = delta_meam[i][j] = ebound_meam[i][j] = attrac_meam[i][j] = repuls_meam[i][j] = 0.0; @@ -87,4 +94,13 @@ MEAM::~MEAM() memory->destroy(this->scrfcn); memory->destroy(this->dscrfcn); memory->destroy(this->fcpair); + + // msmeam + if (this->msmeamflag){ + memory->destroy(this->arho1m); + memory->destroy(this->arho2m); + memory->destroy(this->arho3m); + 
memory->destroy(this->arho2mb); + memory->destroy(this->arho3mb); + } } diff --git a/src/MEAM/meam_setup_done.cpp b/src/MEAM/meam_setup_done.cpp index 93f2552465..de1188349c 100644 --- a/src/MEAM/meam_setup_done.cpp +++ b/src/MEAM/meam_setup_done.cpp @@ -220,7 +220,6 @@ void MEAM::compute_pair_meam() // loop over r values and compute for (j = 0; j < this->nr; j++) { r = j * this->dr; - this->phir[nv2][j] = phi_meam(r, a, b); // if using second-nearest neighbor, solve recursive problem @@ -333,9 +332,12 @@ double MEAM::phi_meam(double r, int a, int b) lattice_t latta /*unused:,lattb*/; double rho_bkgd1, rho_bkgd2; double b11s, b22s; + // msmeam + double t1m1av, t2m1av, t3m1av, t1m2av, t2m2av, t3m2av; + double rho1m1, rho2m1, rho3m1; + double rho1m2, rho2m2, rho3m2; double phi_m = 0.0; - // Equation numbers below refer to: // I. Huang et.al., Modelling simul. Mater. Sci. Eng. 3:615 @@ -345,8 +347,16 @@ double MEAM::phi_meam(double r, int a, int b) Z2 = get_Zij(this->lattce_meam[b][b]); Z12 = get_Zij(this->lattce_meam[a][b]); - get_densref(r, a, b, &rho01, &rho11, &rho21, &rho31, &rho02, &rho12, &rho22, &rho32); - + // this function has extra args for msmeam + if (this->msmeamflag) { + get_densref(r, a, b, &rho01, &rho11, &rho21, &rho31, &rho02, &rho12, &rho22, &rho32, + &rho1m1, &rho2m1, &rho3m1, + &rho1m2, &rho2m2, &rho3m2); + } else { + get_densref(r, a, b, &rho01, &rho11, &rho21, &rho31, &rho02, &rho12, &rho22, &rho32, + nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr); + } // if densities are too small, numerical problems may result; just return zero if (rho01 <= 1e-14 && rho02 <= 1e-14) return 0.0; @@ -374,6 +384,12 @@ double MEAM::phi_meam(double r, int a, int b) get_tavref(&t11av, &t21av, &t31av, &t12av, &t22av, &t32av, this->t1_meam[a], this->t2_meam[a], this->t3_meam[a], this->t1_meam[b], this->t2_meam[b], this->t3_meam[b], r, a, b, this->lattce_meam[a][b]); + // with msmeam call twice with different sets of variables + if (this->msmeamflag) { + 
get_tavref(&t1m1av, &t2m1av, &t3m1av, &t1m2av, &t2m2av, &t3m2av, this->t1m_meam[a], this->t2m_meam[a], + this->t3m_meam[a], this->t1m_meam[b], this->t2m_meam[b], this->t3m_meam[b], r, a, b, + this->lattce_meam[a][b]); + } } // for c11b structure, calculate background electron densities @@ -420,17 +436,33 @@ double MEAM::phi_meam(double r, int a, int b) rho0_1 = this->rho0_meam[a] * Z1 * G1; rho0_2 = this->rho0_meam[b] * Z2 * G2; } - Gam1 = (t11av * rho11 + t21av * rho21 + t31av * rho31); - if (rho01 < 1.0e-14) - Gam1 = 0.0; - else - Gam1 = Gam1 / (rho01 * rho01); - Gam2 = (t12av * rho12 + t22av * rho22 + t32av * rho32); - if (rho02 < 1.0e-14) - Gam2 = 0.0; - else - Gam2 = Gam2 / (rho02 * rho02); + if (this->msmeamflag) { + // no additional use of t's here; all included in definitions of rho's for msmeam + Gam1 = rho11 + rho21 + rho31 - (rho1m1 + rho2m1 + rho3m1); + if (rho01 < 1.0e-14) + Gam1 = 0.0; + else + Gam1 = Gam1 / (rho01 * rho01); + Gam2 = rho12 + rho22 + rho32 - (rho1m2 + rho2m2 + rho3m2); + if (rho02 < 1.0e-14) + Gam2 = 0.0; + else + Gam2 = Gam2 / (rho02 * rho02); + + } else { + Gam1 = (t11av * rho11 + t21av * rho21 + t31av * rho31); + if (rho01 < 1.0e-14) + Gam1 = 0.0; + else + Gam1 = Gam1 / (rho01 * rho01); + + Gam2 = (t12av * rho12 + t22av * rho22 + t32av * rho32); + if (rho02 < 1.0e-14) + Gam2 = 0.0; + else + Gam2 = Gam2 / (rho02 * rho02); + } G1 = G_gam(Gam1, this->ibar_meam[a], errorflag); G2 = G_gam(Gam2, this->ibar_meam[b], errorflag); @@ -655,7 +687,9 @@ void MEAM::get_sijk(double C, int i, int j, int k, double* sijk) //------------------------------------------------------------------------------c // Calculate density functions, assuming reference configuration void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, double* rho21, double* rho31, - double* rho02, double* rho12, double* rho22, double* rho32) + double* rho02, double* rho12, double* rho22, double* rho32, + double* rho1m1, double* rho2m1, double* rho3m1, + 
double* rho1m2, double* rho2m2, double* rho3m2) { double a1, a2; double s[3]; @@ -666,18 +700,39 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou double rhoa02, rhoa12, rhoa22, rhoa32; double arat, scrn, denom; double C, s111, s112, s221, S11, S22; + // msmeam + double rhoa1m1, rhoa2m1, rhoa3m1, rhoa1m2, rhoa2m2, rhoa3m2; a1 = r / this->re_meam[a][a] - 1.0; a2 = r / this->re_meam[b][b] - 1.0; rhoa01 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta0_meam[a] * a1); - rhoa11 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta1_meam[a] * a1); - rhoa21 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta2_meam[a] * a1); - rhoa31 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta3_meam[a] * a1); - rhoa02 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta0_meam[b] * a2); - rhoa12 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta1_meam[b] * a2); - rhoa22 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta2_meam[b] * a2); - rhoa32 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta3_meam[b] * a2); + + if (this->msmeamflag) { + // the rho variables are multiplied by t here since ialloy not needed in msmeam + rhoa11 = this->rho0_meam[a] * this->t1_meam[a] * MathSpecial::fm_exp(-this->beta1_meam[a] * a1); + rhoa21 = this->rho0_meam[a] * this->t2_meam[a] * MathSpecial::fm_exp(-this->beta2_meam[a] * a1); + rhoa31 = this->rho0_meam[a] * this->t3_meam[a] * MathSpecial::fm_exp(-this->beta3_meam[a] * a1); + rhoa02 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta0_meam[b] * a2); + rhoa12 = this->rho0_meam[b] * this->t1_meam[b] * MathSpecial::fm_exp(-this->beta1_meam[b] * a2); + rhoa22 = this->rho0_meam[b] * this->t2_meam[b] * MathSpecial::fm_exp(-this->beta2_meam[b] * a2); + rhoa32 = this->rho0_meam[b] * this->t3_meam[b] * MathSpecial::fm_exp(-this->beta3_meam[b] * a2); + // msmeam specific rho vars + rhoa1m1 = this->rho0_meam[a] * this->t1m_meam[a] * MathSpecial::fm_exp(-this->beta1m_meam[a] * a1); + rhoa2m1 = 
this->rho0_meam[a] * this->t2m_meam[a] * MathSpecial::fm_exp(-this->beta2m_meam[a] * a1); + rhoa3m1 = this->rho0_meam[a] * this->t3m_meam[a] * MathSpecial::fm_exp(-this->beta3m_meam[a] * a1); + rhoa1m2 = this->rho0_meam[b] * this->t1m_meam[b] * MathSpecial::fm_exp(-this->beta1m_meam[b] * a2); + rhoa2m2 = this->rho0_meam[b] * this->t2m_meam[b] * MathSpecial::fm_exp(-this->beta2m_meam[b] * a2); + rhoa3m2 = this->rho0_meam[b] * this->t3m_meam[b] * MathSpecial::fm_exp(-this->beta3m_meam[b] * a2); + } else { + rhoa11 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta1_meam[a] * a1); + rhoa21 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta2_meam[a] * a1); + rhoa31 = this->rho0_meam[a] * MathSpecial::fm_exp(-this->beta3_meam[a] * a1); + rhoa02 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta0_meam[b] * a2); + rhoa12 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta1_meam[b] * a2); + rhoa22 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta2_meam[b] * a2); + rhoa32 = this->rho0_meam[b] * MathSpecial::fm_exp(-this->beta3_meam[b] * a2); + } lat = this->lattce_meam[a][b]; @@ -689,7 +744,16 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou *rho12 = 0.0; *rho22 = 0.0; *rho32 = 0.0; + if (this->msmeamflag) { + *rho1m1 = 0.0; + *rho2m1 = 0.0; + *rho3m1 = 0.0; + *rho1m2 = 0.0; + *rho2m2 = 0.0; + *rho3m2 = 0.0; + } + // keep track of density components separately; combine in the calling subroutine switch (lat) { case FCC: *rho01 = 12.0 * rhoa02; @@ -710,12 +774,20 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou *rho02 = 4.0 * rhoa01; *rho31 = 32.0 / 9.0 * rhoa32 * rhoa32; *rho32 = 32.0 / 9.0 * rhoa31 * rhoa31; + if (this->msmeamflag) { + *rho3m1 = 32.0 / 9.0 * rhoa3m2 * rhoa3m2; + *rho3m2 = 32.0 / 9.0 * rhoa3m1 * rhoa3m1; + } break; case HCP: *rho01 = 12 * rhoa02; *rho02 = 12 * rhoa01; *rho31 = 1.0 / 3.0 * rhoa32 * rhoa32; *rho32 = 1.0 / 3.0 * rhoa31 * rhoa31; + if (this->msmeamflag) { + 
*rho3m1 = 1.0 / 3.0 * rhoa3m2 * rhoa3m2; + *rho3m2 = 1.0 / 3.0 * rhoa3m1 * rhoa3m1; + } break; case DIM: get_shpfcn(DIM, 0, 0, s); @@ -727,6 +799,14 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou *rho22 = s[1] * rhoa21 * rhoa21; *rho31 = s[2] * rhoa32 * rhoa32; *rho32 = s[2] * rhoa31 * rhoa31; + if (this->msmeamflag) { + *rho1m1 = s[0] * rhoa1m2 * rhoa1m2; + *rho1m2 = s[0] * rhoa1m1 * rhoa1m1; + *rho2m1 = s[1] * rhoa2m2 * rhoa2m2; + *rho2m2 = s[1] * rhoa2m1 * rhoa2m1; + *rho3m1 = s[2] * rhoa3m2 * rhoa3m2; + *rho3m2 = s[2] * rhoa3m1 * rhoa3m1; + } break; case C11: *rho01 = rhoa01; @@ -737,17 +817,28 @@ void MEAM::get_densref(double r, int a, int b, double* rho01, double* rho11, dou *rho22 = rhoa22; *rho31 = rhoa31; *rho32 = rhoa32; + if (this->msmeamflag) { + *rho1m1 = rhoa1m1; + *rho1m2 = rhoa1m2; + *rho2m1 = rhoa2m1; + *rho2m2 = rhoa2m2; + *rho3m1 = rhoa3m1; + *rho3m2 = rhoa3m2; + } break; case L12: *rho01 = 8 * rhoa01 + 4 * rhoa02; *rho02 = 12 * rhoa01; - if (this->ialloy == 1) { + if (this->ialloy == 1) { *rho21 = 8. / 3. * MathSpecial::square(rhoa21 * this->t2_meam[a] - rhoa22 * this->t2_meam[b]); denom = 8 * rhoa01 * MathSpecial::square(this->t2_meam[a]) + 4 * rhoa02 * MathSpecial::square(this->t2_meam[b]); if (denom > 0.) *rho21 = *rho21 / denom * *rho01; } else *rho21 = 8. / 3. * (rhoa21 - rhoa22) * (rhoa21 - rhoa22); + if (this->msmeamflag) { + *rho2m1 = 8. / 3. 
* (rhoa2m1 - rhoa2m2) * (rhoa2m1 - rhoa2m2); + } break; case B2: *rho01 = 8.0 * rhoa02; @@ -864,6 +955,7 @@ void MEAM::interpolate_meam(int ind) this->rdrar = 1.0 / drar; // phir interp + for (j = 0; j < this->nrar; j++) { this->phirar[ind][j] = this->phir[ind][j]; } diff --git a/src/MEAM/meam_setup_global.cpp b/src/MEAM/meam_setup_global.cpp index 545a2ad3f4..5d35242e7c 100644 --- a/src/MEAM/meam_setup_global.cpp +++ b/src/MEAM/meam_setup_global.cpp @@ -36,7 +36,8 @@ void MEAM::meam_setup_global(int nelt, lattice_t* lat, int* ielement, double* /*atwt*/, double* alpha, double* b0, double* b1, double* b2, double* b3, double* alat, double* esub, double* asub, double* t0, double* t1, double* t2, double* t3, double* rozero, - int* ibar) + int* ibar, double* b1m, double *b2m, double *b3m, double *t1m, double *t2m, + double *t3m) { int i; @@ -53,6 +54,11 @@ MEAM::meam_setup_global(int nelt, lattice_t* lat, int* ielement, double* /*atwt* this->beta1_meam[i] = b1[i]; this->beta2_meam[i] = b2[i]; this->beta3_meam[i] = b3[i]; + if (this->msmeamflag){ + this->beta1m_meam[i] = b1m[i]; + this->beta2m_meam[i] = b2m[i]; + this->beta3m_meam[i] = b3m[i]; + } tmplat[i] = alat[i]; this->Ec_meam[i][i] = esub[i]; this->A_meam[i] = asub[i]; @@ -60,6 +66,11 @@ MEAM::meam_setup_global(int nelt, lattice_t* lat, int* ielement, double* /*atwt* this->t1_meam[i] = t1[i]; this->t2_meam[i] = t2[i]; this->t3_meam[i] = t3[i]; + if (this->msmeamflag){ + this->t1m_meam[i] = t1m[i]; + this->t2m_meam[i] = t2m[i]; + this->t3m_meam[i] = t3m[i]; + } this->rho0_meam[i] = rozero[i]; this->ibar_meam[i] = ibar[i]; diff --git a/src/MEAM/pair_meam.cpp b/src/MEAM/pair_meam.cpp index bcfffbe52b..c4a4cfa1d7 100644 --- a/src/MEAM/pair_meam.cpp +++ b/src/MEAM/pair_meam.cpp @@ -58,13 +58,12 @@ PairMEAM::PairMEAM(LAMMPS *lmp) : Pair(lmp) allocated = 0; nlibelements = 0; + meam_inst = new MEAM(memory); + meam_inst->msmeamflag = msmeamflag = 0; + myname = "meam"; + scale = nullptr; - - // set comm size needed by this 
Pair - - comm_forward = 38; - comm_reverse = 30; } /* ---------------------------------------------------------------------- @@ -93,7 +92,6 @@ void PairMEAM::compute(int eflag, int vflag) int i,ii,n,inum_half,errorflag; int *ilist_half,*numneigh_half,**firstneigh_half; int *numneigh_full,**firstneigh_full; - ev_init(eflag,vflag); // neighbor list info @@ -133,7 +131,6 @@ void PairMEAM::compute(int eflag, int vflag) int offset = 0; errorflag = 0; - for (ii = 0; ii < inum_half; ii++) { i = ilist_half[ii]; meam_inst->meam_dens_init(i,ntype,type,map,x, @@ -142,9 +139,7 @@ void PairMEAM::compute(int eflag, int vflag) offset); offset += numneigh_half[i]; } - comm->reverse_comm(this); - meam_inst->meam_dens_final(nlocal,eflag_either,eflag_global,eflag_atom, &eng_vdwl,eatom,ntype,type,map,scale,errorflag); if (errorflag) @@ -159,7 +154,6 @@ void PairMEAM::compute(int eflag, int vflag) double **vptr = nullptr; if (vflag_atom) vptr = vatom; - for (ii = 0; ii < inum_half; ii++) { i = ilist_half[ii]; meam_inst->meam_force(i,eflag_global,eflag_atom,vflag_global, @@ -169,7 +163,6 @@ void PairMEAM::compute(int eflag, int vflag) offset,f,vptr,virial); offset += numneigh_half[i]; } - if (vflag_fdotr) virial_fdotr_compute(); } @@ -193,7 +186,17 @@ void PairMEAM::allocate() void PairMEAM::settings(int narg, char ** /*arg*/) { - if (narg != 0) error->all(FLERR,"Illegal pair_style command"); + if (narg != 0) error->all(FLERR,"Illegal pair_style {} command", myname); + + // set comm size needed by this Pair + + if (msmeamflag) { + comm_forward = 38+23; // plus 23 for msmeam + comm_reverse = 30+23; // plus 23 for msmeam + } else { + comm_forward = 38; + comm_reverse = 30; + } } /* ---------------------------------------------------------------------- @@ -206,12 +209,7 @@ void PairMEAM::coeff(int narg, char **arg) if (!allocated) allocate(); - if (narg < 6) error->all(FLERR,"Incorrect args for pair coefficients"); - - // ensure I,J args are * * - - if (strcmp(arg[0],"*") != 0 || 
strcmp(arg[1],"*") != 0) - error->all(FLERR,"Incorrect args for pair coefficients"); + if (narg < 6) error->all(FLERR,"Incorrect args for pair style {} coefficients", myname); // check for presence of first meam file @@ -239,7 +237,7 @@ void PairMEAM::coeff(int narg, char **arg) } if (paridx < 0) error->all(FLERR,"No MEAM parameter file in pair coefficients"); if ((narg - paridx - 1) != atom->ntypes) - error->all(FLERR,"Incorrect args for pair coefficients"); + error->all(FLERR,"Incorrect args for pair style {} coefficients", myname); // MEAM element names between 2 filenames // nlibelements = # of MEAM elements @@ -282,7 +280,7 @@ void PairMEAM::coeff(int narg, char **arg) if (libelements[j] == arg[i]) break; if (j < nlibelements) map[m] = j; else if (strcmp(arg[i],"NULL") == 0) map[m] = -1; - else error->all(FLERR,"Incorrect args for pair coefficients"); + else error->all(FLERR,"Incorrect args for pair style {} coefficients", myname); } // clear setflag since coeff() called once with I,J = * * @@ -307,7 +305,7 @@ void PairMEAM::coeff(int narg, char **arg) } } - if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients"); + if (count == 0) error->all(FLERR,"Incorrect args for pair style {} coefficients", myname); } /* ---------------------------------------------------------------------- @@ -317,7 +315,7 @@ void PairMEAM::coeff(int narg, char **arg) void PairMEAM::init_style() { if (force->newton_pair == 0) - error->all(FLERR,"Pair style MEAM requires newton pair on"); + error->all(FLERR,"Pair style {} requires newton pair on", myname); // need a full and a half neighbor list @@ -360,7 +358,9 @@ void PairMEAM::read_files(const std::string &globalfile, void PairMEAM::read_global_meam_file(const std::string &globalfile) { + // allocate parameter arrays + std::vector lat(nlibelements); std::vector ielement(nlibelements); std::vector ibar(nlibelements); @@ -381,6 +381,15 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) std::vector 
rozero(nlibelements); std::vector found(nlibelements, false); + // allocate 6 extra arrays for msmeam + + std::vector b1m(nlibelements); + std::vector b2m(nlibelements); + std::vector b3m(nlibelements); + std::vector t1m(nlibelements); + std::vector t2m(nlibelements); + std::vector t3m(nlibelements); + // open global meamf file on proc 0 if (comm->me == 0) { @@ -416,8 +425,7 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) std::string lattice_type = values.next_string(); if (!MEAM::str_to_lat(lattice_type, true, lat[index])) - error->one(FLERR,"Unrecognized lattice type in MEAM " - "library file: {}", lattice_type); + error->one(FLERR,"Unrecognized lattice type in MEAM library file: {}", lattice_type); // store parameters @@ -429,6 +437,11 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) b1[index] = values.next_double(); b2[index] = values.next_double(); b3[index] = values.next_double(); + if (msmeamflag) { + b1m[index] = values.next_double(); + b2m[index] = values.next_double(); + b3m[index] = values.next_double(); + } alat[index] = values.next_double(); esub[index] = values.next_double(); asub[index] = values.next_double(); @@ -436,15 +449,20 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) t1[index] = values.next_double(); t2[index] = values.next_double(); t3[index] = values.next_double(); + if (msmeamflag) { + t1m[index] = values.next_double(); + t2m[index] = values.next_double(); + t3m[index] = values.next_double(); + } rozero[index] = values.next_double(); ibar[index] = values.next_int(); if (!isone(t0[index])) - error->one(FLERR,"Unsupported parameter in MEAM library file: t0!=1"); + error->one(FLERR,"Unsupported parameter in MEAM library file: t0 != 1"); // z given is ignored: if this is mismatched, we definitely won't do what the user said -> fatal error if (z[index] != MEAM::get_Zij(lat[index])) - error->one(FLERR,"Mismatched parameter in MEAM library file: z!=lat"); + 
error->one(FLERR,"Mismatched parameter in MEAM library file: z != lat"); nset++; } catch (TokenizerException &e) { @@ -484,13 +502,29 @@ void PairMEAM::read_global_meam_file(const std::string &globalfile) MPI_Bcast(t2.data(), nlibelements, MPI_DOUBLE, 0, world); MPI_Bcast(t3.data(), nlibelements, MPI_DOUBLE, 0, world); MPI_Bcast(rozero.data(), nlibelements, MPI_DOUBLE, 0, world); + // distribute msmeam parameter sets + MPI_Bcast(b1m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(b2m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(b3m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(t1m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(t2m.data(), nlibelements, MPI_DOUBLE, 0, world); + MPI_Bcast(t3m.data(), nlibelements, MPI_DOUBLE, 0, world); // pass element parameters to MEAM package - meam_inst->meam_setup_global(nlibelements, lat.data(), ielement.data(), atwt.data(), - alpha.data(), b0.data(), b1.data(), b2.data(), b3.data(), - alat.data(), esub.data(), asub.data(), t0.data(), t1.data(), - t2.data(), t3.data(), rozero.data(), ibar.data()); + if (msmeamflag) { + meam_inst->meam_setup_global(nlibelements, lat.data(), ielement.data(), atwt.data(), + alpha.data(), b0.data(), b1.data(), b2.data(), b3.data(), + alat.data(), esub.data(), asub.data(), t0.data(), t1.data(), + t2.data(), t3.data(), rozero.data(), ibar.data(), b1m.data(), + b2m.data(), b3m.data(), t1m.data(), t2m.data(), t3m.data()); + } else { + meam_inst->meam_setup_global(nlibelements, lat.data(), ielement.data(), atwt.data(), + alpha.data(), b0.data(), b1.data(), b2.data(), b3.data(), + alat.data(), esub.data(), asub.data(), t0.data(), t1.data(), + t2.data(), t3.data(), rozero.data(), ibar.data(), nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr); + } // set element masses @@ -613,6 +647,23 @@ int PairMEAM::pack_forward_comm(int n, int *list, double *buf, buf[m++] = meam_inst->tsq_ave[j][0]; buf[m++] = meam_inst->tsq_ave[j][1]; buf[m++] = 
meam_inst->tsq_ave[j][2]; + if (msmeamflag) { + buf[m++] = meam_inst->arho2mb[j]; + buf[m++] = meam_inst->arho1m[j][0]; + buf[m++] = meam_inst->arho1m[j][1]; + buf[m++] = meam_inst->arho1m[j][2]; + buf[m++] = meam_inst->arho2m[j][0]; + buf[m++] = meam_inst->arho2m[j][1]; + buf[m++] = meam_inst->arho2m[j][2]; + buf[m++] = meam_inst->arho2m[j][3]; + buf[m++] = meam_inst->arho2m[j][4]; + buf[m++] = meam_inst->arho2m[j][5]; + for (k = 0; k < 10; k++) buf[m++] = meam_inst->arho3m[j][k]; + buf[m++] = meam_inst->arho3mb[j][0]; + buf[m++] = meam_inst->arho3mb[j][1]; + buf[m++] = meam_inst->arho3mb[j][2]; + } + } return m; @@ -656,6 +707,22 @@ void PairMEAM::unpack_forward_comm(int n, int first, double *buf) meam_inst->tsq_ave[i][0] = buf[m++]; meam_inst->tsq_ave[i][1] = buf[m++]; meam_inst->tsq_ave[i][2] = buf[m++]; + if (msmeamflag) { + meam_inst->arho2mb[i] = buf[m++]; + meam_inst->arho1m[i][0] = buf[m++]; + meam_inst->arho1m[i][1] = buf[m++]; + meam_inst->arho1m[i][2] = buf[m++]; + meam_inst->arho2m[i][0] = buf[m++]; + meam_inst->arho2m[i][1] = buf[m++]; + meam_inst->arho2m[i][2] = buf[m++]; + meam_inst->arho2m[i][3] = buf[m++]; + meam_inst->arho2m[i][4] = buf[m++]; + meam_inst->arho2m[i][5] = buf[m++]; + for (k = 0; k < 10; k++) meam_inst->arho3m[i][k] = buf[m++]; + meam_inst->arho3mb[i][0] = buf[m++]; + meam_inst->arho3mb[i][1] = buf[m++]; + meam_inst->arho3mb[i][2] = buf[m++]; + } } } @@ -689,6 +756,22 @@ int PairMEAM::pack_reverse_comm(int n, int first, double *buf) buf[m++] = meam_inst->tsq_ave[i][0]; buf[m++] = meam_inst->tsq_ave[i][1]; buf[m++] = meam_inst->tsq_ave[i][2]; + if (msmeamflag) { + buf[m++] = meam_inst->arho2mb[i]; + buf[m++] = meam_inst->arho1m[i][0]; + buf[m++] = meam_inst->arho1m[i][1]; + buf[m++] = meam_inst->arho1m[i][2]; + buf[m++] = meam_inst->arho2m[i][0]; + buf[m++] = meam_inst->arho2m[i][1]; + buf[m++] = meam_inst->arho2m[i][2]; + buf[m++] = meam_inst->arho2m[i][3]; + buf[m++] = meam_inst->arho2m[i][4]; + buf[m++] = meam_inst->arho2m[i][5]; 
+ for (k = 0; k < 10; k++) buf[m++] = meam_inst->arho3m[i][k]; + buf[m++] = meam_inst->arho3mb[i][0]; + buf[m++] = meam_inst->arho3mb[i][1]; + buf[m++] = meam_inst->arho3mb[i][2]; + } } return m; @@ -724,7 +807,25 @@ void PairMEAM::unpack_reverse_comm(int n, int *list, double *buf) meam_inst->tsq_ave[j][0] += buf[m++]; meam_inst->tsq_ave[j][1] += buf[m++]; meam_inst->tsq_ave[j][2] += buf[m++]; + if (msmeamflag) { + meam_inst->arho2mb[j] += buf[m++]; + meam_inst->arho1m[j][0] += buf[m++]; + meam_inst->arho1m[j][1] += buf[m++]; + meam_inst->arho1m[j][2] += buf[m++]; + meam_inst->arho2m[j][0] += buf[m++]; + meam_inst->arho2m[j][1] += buf[m++]; + meam_inst->arho2m[j][2] += buf[m++]; + meam_inst->arho2m[j][3] += buf[m++]; + meam_inst->arho2m[j][4] += buf[m++]; + meam_inst->arho2m[j][5] += buf[m++]; + for (k = 0; k < 10; k++) meam_inst->arho3m[j][k] += buf[m++]; + meam_inst->arho3mb[j][0] += buf[m++]; + meam_inst->arho3mb[j][1] += buf[m++]; + meam_inst->arho3mb[j][2] += buf[m++]; + } } + + } /* ---------------------------------------------------------------------- diff --git a/src/MEAM/pair_meam.h b/src/MEAM/pair_meam.h index 16ba38fcb2..a89714bfa9 100644 --- a/src/MEAM/pair_meam.h +++ b/src/MEAM/pair_meam.h @@ -47,6 +47,8 @@ class PairMEAM : public Pair { class MEAM *meam_inst; double cutmax; // max cutoff for all elements int nlibelements; // # of library elements + int msmeamflag; // 0 (default) for normal MEAM, 1 for MS-MEAM + std::string myname; // name of the pair style std::vector libelements; // names of library elements std::vector mass; // mass of library element diff --git a/src/MEAM/pair_meam_ms.cpp b/src/MEAM/pair_meam_ms.cpp new file mode 100644 index 0000000000..982a54f546 --- /dev/null +++ b/src/MEAM/pair_meam_ms.cpp @@ -0,0 +1,25 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS development team: 
developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "pair_meam_ms.h" +#include "meam.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairMEAMMS::PairMEAMMS(LAMMPS *lmp) : PairMEAM(lmp) +{ + meam_inst->msmeamflag = msmeamflag = 1; + myname = "meam/ms"; +} diff --git a/src/MEAM/pair_meam_ms.h b/src/MEAM/pair_meam_ms.h new file mode 100644 index 0000000000..25878203ed --- /dev/null +++ b/src/MEAM/pair_meam_ms.h @@ -0,0 +1,33 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(meam/ms,PairMEAMMS); +// clang-format on +#else + +#ifndef LMP_PAIR_MEAM_MS_H +#define LMP_PAIR_MEAM_MS_H + +#include "pair_meam.h" + +namespace LAMMPS_NS { + +class PairMEAMMS : public PairMEAM { + public: + PairMEAMMS(class LAMMPS *); +}; +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/ML-IAP/pair_mliap.cpp b/src/ML-IAP/pair_mliap.cpp index 6b55fb3373..929a32020b 100644 --- a/src/ML-IAP/pair_mliap.cpp +++ b/src/ML-IAP/pair_mliap.cpp @@ -83,7 +83,6 @@ void PairMLIAP::compute(int eflag, int vflag) { // consistency checks - if (data->ndescriptors != model->ndescriptors) error->all(FLERR, "Inconsistent model and descriptor descriptor count: {} vs {}", model->ndescriptors, data->ndescriptors); @@ -134,10 +133,10 @@ void PairMLIAP::allocate() void PairMLIAP::settings(int narg, char ** arg) { - if (narg < 2) utils::missing_cmd_args(FLERR, "pair_style mliap", error); // This is needed because the unit test calls settings twice if (!is_child) { + if (narg < 2) utils::missing_cmd_args(FLERR, "pair_style mliap", error); delete model; model = nullptr; delete descriptor; diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index 65a2e6d8ce..ce04be2cc8 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -21,6 +21,7 @@ #include "atom.h" #include "atom_vec.h" +#include "citeme.h" #include "comm.h" #include "domain.h" #include "error.h" @@ -36,12 +37,25 @@ #include "pair_reaxff.h" #include "reaxff_defs.h" +#include #include #include +#include using namespace LAMMPS_NS; using namespace FixConst; +static const char cite_reaxff_species_delete[] = + "fix reaxff/species, 'delete' keyword: https://doi.org/10.1016/j.carbon.2022.11.002\n\n" + "@Article{Gissinger23,\n" + " author = {J. R. Gissinger, S. R. Zavada, J. G. Smith, J. Kemppainen, I. Gallegos, G. M. Odegard, E. J. 
Siochi, K. E. Wise},\n" + " title = {Predicting char yield of high-temperature resins},\n" + " journal = {Carbon},\n" + " year = 2023,\n" + " volume = 202,\n" + " pages = {336-347}\n" + "}\n\n"; + /* ---------------------------------------------------------------------- */ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : @@ -145,6 +159,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : ele = filepos = filedel = nullptr; eleflag = posflag = padflag = 0; delflag = specieslistflag = masslimitflag = 0; + delete_Nlimit = delete_Nsteps = 0; singlepos_opened = multipos_opened = del_opened = 0; multipos = 0; @@ -221,7 +236,12 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : } else error->all(FLERR, "Unknown fix reaxff/species delete option: {}", arg[iarg]); - + // rate limit when deleting molecules + } else if (strcmp(arg[iarg], "delete_rate_limit") == 0) { + if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species delete_rate_limit", error); + delete_Nlimit = utils::numeric(FLERR, arg[iarg+1], false, lmp); + delete_Nsteps = utils::numeric(FLERR, arg[iarg+2], false, lmp); + iarg += 3; // position of molecules } else if (strcmp(arg[iarg], "position") == 0) { if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species position", error); @@ -260,6 +280,15 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : if (delflag && specieslistflag && masslimitflag) error->all(FLERR, "Incompatible combination fix reaxff/species command options"); + if (delete_Nlimit > 0) { + if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete); + memory->create(delete_Tcount,delete_Nsteps,"reaxff/species:delete_Tcount"); + + for (int i = 0; i < delete_Nsteps; i++) + delete_Tcount[i] = -1; + delete_Tcount[0] = 0; + } + vector_nmole = 0; vector_nspec = 0; } @@ -279,6 +308,7 @@ FixReaxFFSpecies::~FixReaxFFSpecies() memory->destroy(Mol2Spec); memory->destroy(MolType); 
memory->destroy(MolName); + memory->destroy(delete_Tcount); delete[] filepos; delete[] filedel; @@ -375,7 +405,13 @@ void FixReaxFFSpecies::Output_ReaxFF_Bonds(bigint ntimestep, FILE * /*fp*/) // point to fix_ave_atom f_SPECBOND->end_of_step(); - if (ntimestep != nvalid) return; + if (ntimestep != nvalid) { + // push back delete_Tcount on every step + if (delete_Nlimit > 0) + for (int i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; + return; + } nlocal = atom->nlocal; @@ -826,6 +862,15 @@ void FixReaxFFSpecies::WritePos(int Nmole, int Nspec) void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) { + int ndeletions; + int headroom = -1; + if (delete_Nlimit > 0) { + if (delete_Tcount[delete_Nsteps-1] == -1) return; + ndeletions = delete_Tcount[0] - delete_Tcount[delete_Nsteps-1]; + headroom = MAX(0, delete_Nlimit - ndeletions); + if (headroom == 0) return; + } + int i, j, m, n, itype, cid; int ndel, ndelone, count, count_tmp; int *Nameall; @@ -856,7 +901,23 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) int *marklist; memory->create(marklist, nlocal, "reaxff/species:marklist"); - for (m = 1; m <= Nmole; m++) { + std::random_device rnd; + std::minstd_rand park_rng(rnd()); + int *molrange; + memory->create(molrange,Nmole,"reaxff/species:molrange"); + for (m = 0; m < Nmole; m++) + molrange[m] = m + 1; + if (delete_Nlimit > 0) { + // shuffle index when using rate_limit, in case order is biased + if (comm->me == 0) + std::shuffle(&molrange[0],&molrange[Nmole], park_rng); + MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world); + } + + int this_delete_Tcount = 0; + for (int mm = 0; mm < Nmole; mm++) { + if (this_delete_Tcount == headroom) break; + m = molrange[mm]; localmass = totalmass = count = nmarklist = 0; for (n = 0; n < ntypes; n++) Name[n] = 0; @@ -896,6 +957,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) // find corresponding moltype if (totalmass > massmin && totalmass < massmax) { + this_delete_Tcount++; 
for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[Mol2Spec[m - 1]] += 1.0 / (double) count; @@ -905,6 +967,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) if (count > 0) { for (i = 0; i < ndelspec; i++) { if (del_species[i] == species_str) { + this_delete_Tcount++; for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[i] += 1.0 / (double) count; @@ -976,6 +1039,14 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) } } + + // push back delete_Tcount on every step + if (delete_Nlimit > 0) { + for (i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; + delete_Tcount[0] += this_delete_Tcount; + } + if (ndel && (atom->map_style != Atom::MAP_NONE)) { atom->nghost = 0; atom->map_init(); @@ -988,6 +1059,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) memory->destroy(marklist); memory->destroy(mark); memory->destroy(deletecount); + memory->destroy(molrange); } /* ---------------------------------------------------------------------- */ diff --git a/src/REAXFF/fix_reaxff_species.h b/src/REAXFF/fix_reaxff_species.h index 65eeae4c60..329e17145b 100644 --- a/src/REAXFF/fix_reaxff_species.h +++ b/src/REAXFF/fix_reaxff_species.h @@ -60,6 +60,7 @@ class FixReaxFFSpecies : public Fix { FILE *fp, *pos, *fdel; int eleflag, posflag, multipos, padflag, setupflag; int delflag, specieslistflag, masslimitflag; + int delete_Nlimit, delete_Nsteps, *delete_Tcount; double massmin, massmax; int singlepos_opened, multipos_opened, del_opened; char *ele, **eletype, *filepos, *filedel; diff --git a/src/REPLICA/fix_pimd.cpp b/src/REPLICA/fix_pimd.cpp index 5daff2d643..154f3deecd 100644 --- a/src/REPLICA/fix_pimd.cpp +++ b/src/REPLICA/fix_pimd.cpp @@ -84,14 +84,15 @@ FixPIMD::FixPIMD(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) else if (strcmp(arg[i + 1], "cmd") == 0) method = CMD; else - error->universe_all(FLERR, "Unknown method parameter for fix pimd"); + error->universe_all(FLERR, 
fmt::format("Unknown method parameter {} for fix pimd", + arg[i + 1])); } else if (strcmp(arg[i], "fmass") == 0) { fmass = utils::numeric(FLERR, arg[i + 1], false, lmp); - if (fmass < 0.0 || fmass > 1.0) - error->universe_all(FLERR, "Invalid fmass value for fix pimd"); + if ((fmass < 0.0) || (fmass > np)) + error->universe_all(FLERR, fmt::format("Invalid fmass value {} for fix pimd", fmass)); } else if (strcmp(arg[i], "sp") == 0) { sp = utils::numeric(FLERR, arg[i + 1], false, lmp); - if (fmass < 0.0) error->universe_all(FLERR, "Invalid sp value for fix pimd"); + if (sp < 0.0) error->universe_all(FLERR, "Invalid sp value for fix pimd"); } else if (strcmp(arg[i], "temp") == 0) { nhc_temp = utils::numeric(FLERR, arg[i + 1], false, lmp); if (nhc_temp < 0.0) error->universe_all(FLERR, "Invalid temp value for fix pimd"); @@ -120,7 +121,7 @@ FixPIMD::FixPIMD(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) global_freq = 1; vector_flag = 1; - size_vector = 2; + size_vector = 3; extvector = 1; comm_forward = 3; @@ -135,6 +136,7 @@ FixPIMD::FixPIMD(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) } /* ---------------------------------------------------------------------- */ + FixPIMD::~FixPIMD() { delete[] mass; @@ -166,6 +168,7 @@ FixPIMD::~FixPIMD() } /* ---------------------------------------------------------------------- */ + int FixPIMD::setmask() { int mask = 0; @@ -215,7 +218,7 @@ void FixPIMD::init() double beta = 1.0 / (Boltzmann * nhc_temp); double _fbond = 1.0 * np / (beta * beta * hbar * hbar); - omega_np = sqrt((double)np) / (hbar * beta) * sqrt(force->mvv2e); + omega_np = sqrt((double) np) / (hbar * beta) * sqrt(force->mvv2e); fbond = -_fbond * force->mvv2e; if (universe->me == 0) @@ -306,7 +309,7 @@ void FixPIMD::nhc_init() nhc_eta_dotdot[i][ichain] = 0.0; nhc_eta_mass[i][ichain] = mass0; if ((method == CMD || method == NMPIMD) && universe->iworld == 0) - ; + ; // do nothing else nhc_eta_mass[i][ichain] *= fmass; } @@ -538,6 +541,8 @@ void 
FixPIMD::spring_force() double *xlast = buf_beads[x_last]; double *xnext = buf_beads[x_next]; + virial = 0.0; + for (int i = 0; i < nlocal; i++) { double delx1 = xlast[0] - x[i][0]; double dely1 = xlast[1] - x[i][1]; @@ -557,11 +562,13 @@ void FixPIMD::spring_force() double dy = dely1 + dely2; double dz = delz1 + delz2; + virial += -0.5 * (x[i][0] * f[i][0] + x[i][1] * f[i][1] + x[i][2] * f[i][2]); + f[i][0] -= (dx) *ff; f[i][1] -= (dy) *ff; f[i][2] -= (dz) *ff; - spring_energy += (dx * dx + dy * dy + dz * dz); + spring_energy += -0.5 * ff * (delx2 * delx2 + dely2 * dely2 + delz2 * delz2); } } @@ -875,5 +882,6 @@ double FixPIMD::compute_vector(int n) { if (n == 0) { return spring_energy; } if (n == 1) { return t_sys; } + if (n == 2) { return virial; } return 0.0; } diff --git a/src/REPLICA/fix_pimd.h b/src/REPLICA/fix_pimd.h index 384bc2ce25..b96c088efe 100644 --- a/src/REPLICA/fix_pimd.h +++ b/src/REPLICA/fix_pimd.h @@ -57,7 +57,7 @@ class FixPIMD : public Fix { /* ring-polymer model */ - double omega_np, fbond, spring_energy, sp; + double omega_np, fbond, spring_energy, sp, virial; int x_last, x_next; void spring_force(); diff --git a/src/atom.cpp b/src/atom.cpp index 480a779e68..32285758c0 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2345,6 +2345,18 @@ void Atom::setup_sort_bins() return; } +#ifdef LMP_GPU + if (userbinsize == 0.0) { + auto ifix = dynamic_cast(modify->get_fix_by_id("package_gpu")); + if (ifix) { + const double subx = domain->subhi[0] - domain->sublo[0]; + const double suby = domain->subhi[1] - domain->sublo[1]; + const double subz = domain->subhi[2] - domain->sublo[2]; + binsize = ifix->binsize(subx, suby, subz, atom->nlocal, 0.5 * neighbor->cutneighmax); + } + } +#endif + double bininv = 1.0/binsize; // nbin xyz = local bins diff --git a/src/neighbor.cpp b/src/neighbor.cpp index f2b094ec37..05371c8259 100644 --- a/src/neighbor.cpp +++ b/src/neighbor.cpp @@ -535,6 +535,7 @@ void Neighbor::init() int flag=0; for (int isub=0; isub < 
ph->nstyles; ++isub) { if (force->pair_match("amoeba",0,isub) + || force->pair_match("hippo",0,isub) || force->pair_match("coul/wolf",0,isub) || force->pair_match("coul/dsf",0,isub) || force->pair_match("coul/exclude",0) @@ -545,6 +546,7 @@ void Neighbor::init() special_flag[1] = special_flag[2] = special_flag[3] = 2; } else { if (force->pair_match("amoeba",0) + || force->pair_match("hippo",0) || force->pair_match("coul/wolf",0) || force->pair_match("coul/dsf",0) || force->pair_match("coul/exclude",0) diff --git a/unittest/force-styles/tests/atomic-pair-meam_ms.yaml b/unittest/force-styles/tests/atomic-pair-meam_ms.yaml new file mode 100644 index 0000000000..e479514017 --- /dev/null +++ b/unittest/force-styles/tests/atomic-pair-meam_ms.yaml @@ -0,0 +1,94 @@ +--- +lammps_version: 22 Dec 2022 +tags: slow +date_generated: Thu Jan 26 15:27:03 2023 +epsilon: 2.5e-12 +skip_tests: +prerequisites: ! | + pair meam/ms +pre_commands: ! | + variable newton_pair delete + if "$(is_active(package,gpu)) > 0.0" then "variable newton_pair index off" else "variable newton_pair index on" +post_commands: ! "" +input_file: in.metal +pair_style: meam/ms +pair_coeff: ! | + * * library.msmeam H Ga4 HGa.msmeam H Ga4 +extract: ! | + scale 2 +natoms: 32 +init_vdwl: 785.6030480758675 +init_coul: 0 +init_stress: ! |2- + 3.3502530994900699e+03 3.6405858278699407e+03 3.6349804214165547e+03 -3.1609283411508039e+02 -7.9448207656135153e+01 -1.9854140603340727e+02 +init_forces: ! 
|2 + 1 1.2872255079741514e+01 -7.5031848810810864e-01 4.5969595156096510e+01 + 2 -3.9028679722038632e+01 -1.5647800180326567e+02 -1.6643992152928173e+00 + 3 -6.1521549955194672e+01 2.6970968316419874e+02 -9.6866430262650326e+01 + 4 3.1462579880342336e+01 4.0240291291218455e+01 1.1654869213327775e+01 + 5 1.4859248182951113e+01 -3.4132880749392825e+01 6.7430378007130244e+01 + 6 6.4609571260694096e+00 -3.8973222482916441e+01 -2.8510000379627442e+01 + 7 7.8114612113500250e+00 -1.0421431668544374e+01 -4.2887607385766536e+01 + 8 -4.8934215863351795e+01 -6.3567347969802590e-01 1.1845972792272754e+02 + 9 9.4089549606898402e+01 -7.4342942103394511e+00 2.5331198575951383e+01 + 10 1.5130369934140692e+01 -5.9245630928969938e+01 -6.7469126603400198e+01 + 11 -2.5176547213746847e+01 1.1577205529172168e+02 -2.2897457133540517e+01 + 12 6.2237686199502349e+01 2.0501996047945163e+01 -2.8805091517252826e+01 + 13 -5.9438589221526925e+01 3.0453092653824072e+01 -1.9919245831196157e+01 + 14 6.9128305482543766e+01 -7.7400771634148342e+01 3.3376079908119145e+01 + 15 -4.9671207786831857e+01 -4.9520814527298228e+01 8.4325181097614305e+01 + 16 -1.1782591146017666e+01 -3.2478963020209051e+01 1.5503663677714293e+01 + 17 9.0881787245915220e+00 6.2377477671714963e+01 -4.0411006180232363e+01 + 18 -4.2285082775720454e+01 2.4883979527636967e+01 -4.4858149086530510e+00 + 19 -8.0259798420493979e+01 9.6356660229207137e+01 6.0543230952477984e+01 + 20 8.0924547938759346e+01 7.1034504027236025e+01 -7.1958482512489610e+01 + 21 1.0833434220705425e+02 -1.5973910256481020e+02 -2.5432700070393153e+01 + 22 -2.3754601906353900e+00 5.2216955012971823e+01 4.7112051341131576e+00 + 23 -2.7227169255996543e+01 8.1968603165764222e+01 4.6535834898716878e+01 + 24 -2.9230758067555616e+01 6.5909555829367733e+01 -2.8250697734131258e+01 + 25 -5.1310041582953993e+01 -3.0895272949222822e+01 -5.4271286813003794e+00 + 26 3.9605941911194620e+01 -5.5919050176828883e+01 -1.0209061328106253e+01 + 27 8.2934427989660890e+01 
6.1956200199325636e+01 5.0072108788590960e+01 + 28 -7.8572755094413296e+01 -3.9613391730681300e+01 -2.6183413623428891e+00 + 29 6.9475725072041925e+01 -6.0535433603583563e+01 -1.4566536349135829e+01 + 30 -2.4347184151182930e+01 -1.9359391333689970e+02 -2.6718379302915952e+01 + 31 7.7351971629808688e+01 -7.0102650745312999e+01 -5.4615048867524763e+01 + 32 -1.5060591772899014e+02 8.4489763988097266e+01 2.9799482293372058e+01 +run_vdwl: 682.3107192428497 +run_coul: 0 +run_stress: ! |2- + 3.2247564044913129e+03 3.3749506031067485e+03 3.3223794967215117e+03 -2.8460979167554797e+02 -7.2614457076660575e+00 -3.1510685747732862e+02 +run_forces: ! |2 + 1 -1.2037185973996296e+01 -2.5090364403764944e+01 1.4014184973113366e+01 + 2 -3.7365848425239264e+01 -1.5871199357658887e+02 3.7846333470446991e+00 + 3 -3.2057228694304293e+01 2.5316344962361612e+02 -6.0679585186816752e+01 + 4 2.9086197614116237e+01 4.8267528016068823e+01 4.3387429619749920e+00 + 5 -1.1672554618399744e+01 -2.6840760926124332e+01 4.9694308545223279e+01 + 6 1.1892092913978592e+01 -4.9360840569608243e+01 -2.3083171938147949e+01 + 7 2.1084251901459215e+01 -4.8251731643401072e+00 -3.8474871193885967e+01 + 8 -5.7775944085787714e+01 1.3522956442661442e+01 1.1661345819661486e+02 + 9 7.2926105059437930e+01 4.8686056096860133e+00 2.3817134806042311e+01 + 10 1.7307367990304396e+01 -3.0865570121704572e+01 -1.2314307646704794e+01 + 11 -1.1341297645054201e+01 9.1441145595173211e+01 -2.1806407500802493e+01 + 12 4.0645024127126625e+01 1.2207243511090397e+01 -2.6757649464936929e+01 + 13 -5.2283270287937697e+01 3.4023912643812679e+01 -1.9030352703627774e+01 + 14 8.4403128243303399e+01 -9.3773678297574406e+01 1.6481720093363641e+01 + 15 -4.2790833192154764e+01 -4.3242943642279130e+01 7.1075696811865868e+01 + 16 -1.5041912007490836e+01 -3.3544044565611586e+01 2.4823109532967212e+01 + 17 -9.6413207346836316e-01 4.5826021602656141e+01 -3.9155163702194102e+01 + 18 -2.0337015515785971e+01 7.2815285567550134e+00 
-8.2049879725129813e+00 + 19 -6.4105384732081120e+01 1.1564665740933788e+02 2.4163791756721466e+01 + 20 8.5723654185276146e+01 8.3354105531647818e+01 -6.6380939444134356e+01 + 21 7.2614253221132458e+01 -1.0858997173537107e+02 -9.7505297776024449e+00 + 22 -7.0420361713052930e+00 5.3431098224890221e+01 3.3089063930822551e+00 + 23 -2.6591358240682062e+01 5.7408565880721866e+01 2.7437106471305679e+01 + 24 -4.1792038450554799e+01 5.1730557789864775e+01 -4.0814677464080816e+01 + 25 -4.1432062506590214e+01 -2.5839213423062226e+01 4.2240164846210408e+00 + 26 4.7210066329871566e+01 -5.2462761136081880e+01 -7.3222050314410501e+00 + 27 7.1880187551772764e+01 6.4264938765955392e+01 4.3600944370341068e+01 + 28 -8.4540787660053340e+01 -3.5402262816619938e+01 -1.8100280797937039e+01 + 29 6.9538301274653790e+01 -6.3441028093040622e+01 -1.4636386232064458e+01 + 30 -1.0347208112535196e+01 -1.7647584813608077e+02 7.2581082578181517e+00 + 31 5.5139777976761025e+01 -4.2081916983382541e+01 -4.6602437208067727e+01 + 32 -1.0993230999577290e+02 3.4110056387297462e+01 1.8478090262857769e+01 +...