Merge branch 'develop'
This commit is contained in:
@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir)
|
||||
endfunction()
|
||||
|
||||
macro(pkg_depends PKG1 PKG2)
|
||||
if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2}))
|
||||
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package")
|
||||
if(DEFINED BUILD_${PKG2})
|
||||
if(PKG_${PKG1} AND NOT BUILD_${PKG2})
|
||||
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON")
|
||||
endif()
|
||||
elseif(DEFINED PKG_${PKG2})
|
||||
if(PKG_${PKG1} AND NOT PKG_${PKG2})
|
||||
message(WARNING "The ${PKG1} package depends on the ${PKG2} package. Enabling it.")
|
||||
set(PKG_${PKG2} ON CACHE BOOL "" FORCE)
|
||||
endif()
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
|
||||
@ -1,4 +1,9 @@
|
||||
find_package(ZLIB REQUIRED)
|
||||
find_package(ZLIB)
|
||||
if(NOT ZLIB_FOUND)
|
||||
message(WARNING "No Zlib development support found. Disabling COMPRESS package...")
|
||||
set(PKG_COMPRESS OFF CACHE BOOL "" FORCE)
|
||||
return()
|
||||
endif()
|
||||
target_link_libraries(lammps PRIVATE ZLIB::ZLIB)
|
||||
|
||||
find_package(PkgConfig QUIET)
|
||||
|
||||
@ -26,6 +26,19 @@ elseif(GPU_PREC STREQUAL "SINGLE")
|
||||
set(GPU_PREC_SETTING "SINGLE_SINGLE")
|
||||
endif()
|
||||
|
||||
option(GPU_DEBUG "Enable debugging code of the GPU package" OFF)
|
||||
mark_as_advanced(GPU_DEBUG)
|
||||
|
||||
if(PKG_AMOEBA AND FFT_SINGLE)
|
||||
message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT")
|
||||
endif()
|
||||
|
||||
if (PKG_AMOEBA)
|
||||
list(APPEND GPU_SOURCES
|
||||
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h
|
||||
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp)
|
||||
endif()
|
||||
|
||||
file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
|
||||
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
|
||||
|
||||
@ -151,7 +164,12 @@ if(GPU_API STREQUAL "CUDA")
|
||||
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
|
||||
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
|
||||
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS})
|
||||
if(GPU_DEBUG)
|
||||
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
|
||||
else()
|
||||
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
|
||||
endif()
|
||||
if(CUDPP_OPT)
|
||||
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
|
||||
@ -192,6 +210,7 @@ elseif(GPU_API STREQUAL "OPENCL")
|
||||
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
|
||||
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
|
||||
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
|
||||
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu
|
||||
)
|
||||
|
||||
foreach(GPU_KERNEL ${GPU_LIB_CU})
|
||||
@ -208,6 +227,7 @@ elseif(GPU_API STREQUAL "OPENCL")
|
||||
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
|
||||
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
|
||||
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
|
||||
GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu)
|
||||
|
||||
list(APPEND GPU_LIB_SOURCES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
|
||||
@ -217,14 +237,18 @@ elseif(GPU_API STREQUAL "OPENCL")
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h
|
||||
)
|
||||
|
||||
add_library(gpu STATIC ${GPU_LIB_SOURCES})
|
||||
target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
|
||||
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
|
||||
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
|
||||
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING})
|
||||
if(GPU_DEBUG)
|
||||
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
|
||||
else()
|
||||
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
|
||||
endif()
|
||||
target_link_libraries(lammps PRIVATE gpu)
|
||||
|
||||
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
|
||||
@ -374,8 +398,12 @@ elseif(GPU_API STREQUAL "HIP")
|
||||
|
||||
add_library(gpu STATIC ${GPU_LIB_SOURCES})
|
||||
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
|
||||
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_HIP)
|
||||
target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING})
|
||||
if(GPU_DEBUG)
|
||||
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
|
||||
else()
|
||||
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
|
||||
endif()
|
||||
target_link_libraries(gpu PRIVATE hip::host)
|
||||
|
||||
if(HIP_USE_DEVICE_SORT)
|
||||
|
||||
@ -144,6 +144,7 @@ if(PKG_ML-IAP)
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_descriptor_so3_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_model_linear_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_model_python_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_unified_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/mliap_so3_kokkos.cpp)
|
||||
|
||||
# Add KOKKOS version of ML-IAP Python coupling if non-KOKKOS version is included
|
||||
|
||||
@ -126,10 +126,11 @@ CMake build
|
||||
-D GPU_API=value # value = opencl (default) or cuda or hip
|
||||
-D GPU_PREC=value # precision setting
|
||||
# value = double or mixed (default) or single
|
||||
-D HIP_PATH # path to HIP installation. Must be set if GPU_API=HIP
|
||||
-D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda
|
||||
# value = sm_XX, see below
|
||||
# default is sm_50
|
||||
# value = sm_XX (see below, default is sm_50)
|
||||
-D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers
|
||||
# value = yes or no (default)
|
||||
-D HIP_PATH=value # value = path to HIP installation. Must be set if GPU_API=HIP
|
||||
-D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip
|
||||
# value depends on selected HIP_PLATFORM
|
||||
# default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc
|
||||
|
||||
@ -39,7 +39,7 @@ OPT.
|
||||
* :doc:`agni (o) <pair_agni>`
|
||||
* :doc:`airebo (io) <pair_airebo>`
|
||||
* :doc:`airebo/morse (io) <pair_airebo>`
|
||||
* :doc:`amoeba <pair_amoeba>`
|
||||
* :doc:`amoeba (g) <pair_amoeba>`
|
||||
* :doc:`atm <pair_atm>`
|
||||
* :doc:`awpmd/cut <pair_awpmd>`
|
||||
* :doc:`beck (go) <pair_beck>`
|
||||
@ -126,7 +126,7 @@ OPT.
|
||||
* :doc:`hbond/dreiding/lj (o) <pair_hbond_dreiding>`
|
||||
* :doc:`hbond/dreiding/morse (o) <pair_hbond_dreiding>`
|
||||
* :doc:`hdnnp <pair_hdnnp>`
|
||||
* :doc:`hippo <pair_amoeba>`
|
||||
* :doc:`hippo (g) <pair_amoeba>`
|
||||
* :doc:`ilp/graphene/hbn (t) <pair_ilp_graphene_hbn>`
|
||||
* :doc:`ilp/tmd (t) <pair_ilp_tmd>`
|
||||
* :doc:`kolmogorov/crespi/full <pair_kolmogorov_crespi_full>`
|
||||
@ -200,6 +200,7 @@ OPT.
|
||||
* :doc:`mdpd <pair_mesodpd>`
|
||||
* :doc:`mdpd/rhosum <pair_mesodpd>`
|
||||
* :doc:`meam (k) <pair_meam>`
|
||||
* :doc:`meam/ms (k) <pair_meam>`
|
||||
* :doc:`meam/spline (o) <pair_meam_spline>`
|
||||
* :doc:`meam/sw/spline <pair_meam_sw_spline>`
|
||||
* :doc:`mesocnt <pair_mesocnt>`
|
||||
|
||||
@ -149,6 +149,34 @@ related tasks for each of the partitions, e.g.
|
||||
restart 1000 system_${ibead}.restart1 system_${ibead}.restart2
|
||||
read_restart system_${ibead}.restart2
|
||||
|
||||
Restart, fix_modify, output, run start/stop, minimize info
|
||||
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
|
||||
This fix writes the state of the Nose/Hoover thermostat over all
|
||||
quasi-beads to :doc:`binary restart files <restart>`. See the
|
||||
:doc:`read_restart <read_restart>` command for info on how to re-specify
|
||||
a fix in an input script that reads a restart file, so that the
|
||||
operation of the fix continues in an uninterrupted fashion.
|
||||
|
||||
None of the :doc:`fix_modify <fix_modify>` options
|
||||
are relevant to this fix.
|
||||
|
||||
This fix computes a global 3-vector, which can be accessed by various
|
||||
:doc:`output commands <Howto_output>`. The three quantities in the
|
||||
global vector are
|
||||
|
||||
#. the total spring energy of the quasi-beads,
|
||||
#. the current temperature of the classical system of ring polymers,
|
||||
#. the current value of the scalar virial estimator for the kinetic
|
||||
energy of the quantum system :ref:`(Herman) <Herman>`.
|
||||
|
||||
The vector values calculated by this fix are "extensive", except for the
|
||||
temperature, which is "intensive".
|
||||
|
||||
No parameter of this fix can be used with the *start/stop* keywords of
|
||||
the :doc:`run <run>` command. This fix is not invoked during
|
||||
:doc:`energy minimization <minimize>`.
|
||||
|
||||
Restrictions
|
||||
""""""""""""
|
||||
|
||||
@ -204,3 +232,8 @@ Path Integrals, McGraw-Hill, New York (1965).
|
||||
|
||||
**(Calhoun)** A. Calhoun, M. Pavese, G. Voth, Chem Phys Letters, 262,
|
||||
415 (1996).
|
||||
|
||||
.. _Herman:
|
||||
|
||||
**(Herman)** M. F. Herman, E. J. Bruskin, B. J. Berne, J Chem Phys, 76, 5150 (1982).
|
||||
|
||||
|
||||
@ -39,6 +39,9 @@ Syntax
|
||||
*masslimit* value = massmin massmax
|
||||
massmin = minimum molecular weight of species to delete
|
||||
massmax = maximum molecular weight of species to delete
|
||||
*delete_rate_limit* value = Nlimit Nsteps
|
||||
Nlimit = maximum number of deletions allowed to occur within interval
|
||||
Nsteps = the interval (number of timesteps) over which to count deletions
|
||||
|
||||
Examples
|
||||
""""""""
|
||||
@ -142,7 +145,13 @@ When using the *masslimit* keyword, each line of the *filedel* file
|
||||
contains the timestep on which deletions occurs, followed by how many
|
||||
of each species are deleted (with quantities preceding chemical
|
||||
formulae). The *specieslist* and *masslimit* keywords cannot both be
|
||||
used in the same *reaxff/species* fix.
|
||||
used in the same *reaxff/species* fix. The *delete_rate_limit*
|
||||
keyword can enforce an upper limit on the overall rate of molecule
|
||||
deletion. The number of deletion occurrences is limited to Nlimit
|
||||
within an interval of Nsteps timesteps. When using the
|
||||
*delete_rate_limit* keyword, no deletions are permitted to occur
|
||||
within the first Nsteps timesteps of the first run (after reading a
|
||||
either a data or restart file).
|
||||
|
||||
----------
|
||||
|
||||
|
||||
@ -732,8 +732,8 @@ choices:
|
||||
|
||||
* Use one of the 4 NPT or NPH styles for the rigid bodies. Use the
|
||||
*dilate* all option so that it will dilate the positions of the
|
||||
*non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
|
||||
*other thermostat) for the non-rigid particles.
|
||||
non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
|
||||
other thermostat) for the non-rigid particles.
|
||||
* Use :doc:`fix npt <fix_nh>` for the group of non-rigid particles. Use
|
||||
the *dilate* all option so that it will dilate the center-of-mass
|
||||
positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT
|
||||
|
||||
@ -1,11 +1,18 @@
|
||||
.. index:: pair_style amoeba
|
||||
.. index:: pair_style amoeba/gpu
|
||||
.. index:: pair_style hippo
|
||||
.. index:: pair_style hippo/gpu
|
||||
|
||||
pair_style amoeba command
|
||||
=========================
|
||||
|
||||
Accelerator Variants: *amoeba/gpu*
|
||||
|
||||
pair_style hippo command
|
||||
========================
|
||||
|
||||
Accelerator Variants: *hippo/gpu*
|
||||
|
||||
Syntax
|
||||
""""""
|
||||
|
||||
@ -127,6 +134,10 @@ version discussed in :ref:`(Ponder) <amoeba-Ponder>`, :ref:`(Ren)
|
||||
implementation of HIPPO in LAMMPS matches the version discussed in
|
||||
:ref:`(Rackers) <amoeba-Rackers>`.
|
||||
|
||||
.. versionadded:: TBD
|
||||
|
||||
Accelerator support via the GPU package is available.
|
||||
|
||||
----------
|
||||
|
||||
Only a single pair_coeff command is used with either the *amoeba* and
|
||||
@ -187,6 +198,19 @@ These pair styles can only be used via the *pair* keyword of the
|
||||
|
||||
----------
|
||||
|
||||
.. include:: accel_styles.rst
|
||||
|
||||
.. note::
|
||||
|
||||
Using the GPU accelerated pair styles 'amoeba/gpu' or 'hippo/gpu'
|
||||
when compiling the GPU package for OpenCL has a few known issues
|
||||
when running on integrated GPUs and the calculation may crash.
|
||||
|
||||
The GPU accelerated pair styles are also not (yet) compatible
|
||||
with single precision FFTs.
|
||||
|
||||
----------
|
||||
|
||||
Restrictions
|
||||
""""""""""""
|
||||
|
||||
|
||||
@ -1,17 +1,26 @@
|
||||
.. index:: pair_style meam
|
||||
.. index:: pair_style meam/kk
|
||||
.. index:: pair_style meam/ms
|
||||
.. index:: pair_style meam/ms/kk
|
||||
|
||||
pair_style meam command
|
||||
=========================
|
||||
|
||||
Accelerator Variants: *meam/kk*
|
||||
|
||||
pair_style meam/ms command
|
||||
==========================
|
||||
|
||||
Accelerator Variants: *meam/ms/kk*
|
||||
|
||||
Syntax
|
||||
""""""
|
||||
|
||||
.. code-block:: LAMMPS
|
||||
|
||||
pair_style meam
|
||||
pair_style style
|
||||
|
||||
* style = *meam* or *meam/ms*
|
||||
|
||||
Examples
|
||||
""""""""
|
||||
@ -22,6 +31,9 @@ Examples
|
||||
pair_coeff * * ../potentials/library.meam Si ../potentials/si.meam Si
|
||||
pair_coeff * * ../potentials/library.meam Ni Al NULL Ni Al Ni Ni
|
||||
|
||||
pair_style meam/ms
|
||||
pair_coeff * * ../potentials/library.msmeam H Ga ../potentials/HGa.meam H Ga
|
||||
|
||||
Description
|
||||
"""""""""""
|
||||
|
||||
@ -31,16 +43,23 @@ Description
|
||||
as of November 2010; see description below of the mixture_ref_t
|
||||
parameter
|
||||
|
||||
Pair style *meam* computes non-bonded interactions for a variety of materials
|
||||
using the modified embedded-atom method (MEAM)
|
||||
:ref:`(Baskes) <Baskes>`. Conceptually, it is an extension to the original
|
||||
:doc:`EAM method <pair_eam>` which adds angular forces. It is
|
||||
thus suitable for modeling metals and alloys with fcc, bcc, hcp and
|
||||
diamond cubic structures, as well as materials with covalent interactions
|
||||
like silicon and carbon. This *meam* pair style is a translation of the
|
||||
original Fortran version to C++. It is functionally equivalent but more
|
||||
efficient and has additional features. The Fortran version of the *meam*
|
||||
pair style has been removed from LAMMPS after the 12 December 2018 release.
|
||||
Pair style *meam* computes non-bonded interactions for a variety of
|
||||
materials using the modified embedded-atom method (MEAM) :ref:`(Baskes)
|
||||
<Baskes>`. Conceptually, it is an extension to the original :doc:`EAM
|
||||
method <pair_eam>` which adds angular forces. It is thus suitable for
|
||||
modeling metals and alloys with fcc, bcc, hcp and diamond cubic
|
||||
structures, as well as materials with covalent interactions like silicon
|
||||
and carbon.
|
||||
|
||||
The *meam* pair style is a translation of the original Fortran version
|
||||
to C++. It is functionally equivalent but more efficient and has
|
||||
additional features. The Fortran version of the *meam* pair style has
|
||||
been removed from LAMMPS after the 12 December 2018 release.
|
||||
|
||||
Pair style *meam/ms* uses the multi-state MEAM (MS-MEAM) method
|
||||
according to :ref:`(Baskes2) <Baskes2>`, which is an extension to MEAM.
|
||||
This pair style is mostly equivalent to *meam* and differs only
|
||||
where noted in the documentation below.
|
||||
|
||||
In the MEAM formulation, the total energy E of a system of atoms is
|
||||
given by:
|
||||
@ -351,6 +370,16 @@ Most published MEAM parameter sets use the default values *attrac* = *repulse* =
|
||||
Setting *repuls* = *attrac* = *delta* corresponds to the form used in several
|
||||
recent published MEAM parameter sets, such as :ref:`(Valone) <Valone>`
|
||||
|
||||
Then using *meam/ms* pair style the multi-state MEAM (MS-MEAM) method is
|
||||
activated. This requires 6 extra parameters in the MEAM library file,
|
||||
resulting in 25 parameters ordered that are ordered like this:
|
||||
|
||||
elt, lat, z, ielement, atwt, alpha, b0, b1, b2, b3, b1m, b2m, b3m, alat, esub, asub,
|
||||
t0, t1, t2, t3, t1m, t2m, t3m, rozero, ibar
|
||||
|
||||
The 6 extra MS-MEAM parameters are *b1m, b2m, b3m, t1m, t2m, t3m*.
|
||||
In the LAMMPS ``potentials`` folder, compatible files have an ".msmeam" extension.
|
||||
|
||||
----------
|
||||
|
||||
.. include:: accel_styles.rst
|
||||
@ -393,16 +422,15 @@ This pair style can only be used via the *pair* keyword of the
|
||||
Restrictions
|
||||
""""""""""""
|
||||
|
||||
The *meam* style is provided in the MEAM package. It is
|
||||
only enabled if LAMMPS was built with that package.
|
||||
The *meam* and *meam/ms* pair styles are provided in the MEAM
|
||||
package. They are only enabled if LAMMPS was built with that package.
|
||||
See the :doc:`Build package <Build_package>` page for more info.
|
||||
|
||||
The maximum number of elements, that can be read from the MEAM
|
||||
library file, is determined at compile time. The default is 5.
|
||||
If you need support for more elements, you have to change the
|
||||
define for the constant 'maxelt' at the beginning of the file
|
||||
src/MEAM/meam.h and update/recompile LAMMPS. There is no
|
||||
limit on the number of atoms types.
|
||||
The maximum number of elements, that can be read from the MEAM library
|
||||
file, is determined at compile time. The default is 5. If you need
|
||||
support for more elements, you have to change the the constant 'maxelt'
|
||||
at the beginning of the file ``src/MEAM/meam.h`` and update/recompile
|
||||
LAMMPS. There is no limit on the number of atoms types.
|
||||
|
||||
Related commands
|
||||
""""""""""""""""
|
||||
@ -421,6 +449,10 @@ none
|
||||
|
||||
**(Baskes)** Baskes, Phys Rev B, 46, 2727-2742 (1992).
|
||||
|
||||
.. _Baskes2:
|
||||
|
||||
**(Baskes2)** Baskes, Phys Rev B, 75, 094113 (2007).
|
||||
|
||||
.. _Gullet:
|
||||
|
||||
**(Gullet)** Gullet, Wagner, Slepoy, SANDIA Report 2003-8782 (2003). DOI:10.2172/918395
|
||||
|
||||
@ -277,7 +277,8 @@ accelerated styles exist.
|
||||
* :doc:`lubricateU/poly <pair_lubricateU>` - hydrodynamic lubrication forces for Fast Lubrication with polydispersity
|
||||
* :doc:`mdpd <pair_mesodpd>` - mDPD particle interactions
|
||||
* :doc:`mdpd/rhosum <pair_mesodpd>` - mDPD particle interactions for mass density
|
||||
* :doc:`meam <pair_meam>` - modified embedded atom method (MEAM) in C
|
||||
* :doc:`meam <pair_meam>` - modified embedded atom method (MEAM)
|
||||
* :doc:`meam/ms <pair_meam>` - multi-state modified embedded atom method (MS-MEAM)
|
||||
* :doc:`meam/spline <pair_meam_spline>` - splined version of MEAM
|
||||
* :doc:`meam/sw/spline <pair_meam_sw_spline>` - splined version of MEAM with a Stillinger-Weber term
|
||||
* :doc:`mesocnt <pair_mesocnt>` - mesoscopic vdW potential for (carbon) nanotubes
|
||||
|
||||
30
examples/meam/msmeam/HGa.meam
Normal file
30
examples/meam/msmeam/HGa.meam
Normal file
@ -0,0 +1,30 @@
|
||||
bkgd_dyn = 1
|
||||
emb_lin_neg = 1
|
||||
augt1=0
|
||||
ialloy=1
|
||||
rc = 5.9
|
||||
#H
|
||||
attrac(1,1)=0.460
|
||||
repuls(1,1)=0.460
|
||||
Cmin(1,1,1)=1.3 # PuMS
|
||||
Cmax(1,1,1)= 2.80
|
||||
nn2(1,1)=1
|
||||
#Ga
|
||||
rho0(2) = 0.6
|
||||
attrac(2,2)=0.097
|
||||
repuls(2,2)=0.097
|
||||
nn2(2,2)=1
|
||||
#HGa
|
||||
attrac(1,2)=0.300
|
||||
repuls(1,2)=0.300
|
||||
lattce(1,2)=l12
|
||||
re(1,2)=3.19
|
||||
delta(1,2)=-0.48
|
||||
alpha(1,2)=6.6
|
||||
Cmin(1,1,2)=2.0
|
||||
Cmin(2,1,2)= 2.0
|
||||
Cmin(1,2,1)=2.0
|
||||
Cmin(2,2,1) = 1.4
|
||||
Cmin(1,2,2) = 1.4
|
||||
Cmin(1,1,2) = 1.4
|
||||
nn2(1,2)=1
|
||||
9
examples/meam/msmeam/README.md
Normal file
9
examples/meam/msmeam/README.md
Normal file
@ -0,0 +1,9 @@
|
||||
To run Baske's test, do
|
||||
|
||||
lmp -in in.msmeam
|
||||
|
||||
Then
|
||||
|
||||
diff dump.msmeam dump.msmeam.bu
|
||||
|
||||
|
||||
25
examples/meam/msmeam/data.msmeam.bu
Normal file
25
examples/meam/msmeam/data.msmeam.bu
Normal file
@ -0,0 +1,25 @@
|
||||
LAMMPS data file via write_data, version 16 Feb 2016, timestep = 1
|
||||
|
||||
3 atoms
|
||||
2 atom types
|
||||
|
||||
-4.0000000000000000e+00 4.0000000000000000e+00 xlo xhi
|
||||
-4.0000000000000000e+00 4.0000000000000000e+00 ylo yhi
|
||||
-4.0000000000000000e+00 4.0000000000000000e+00 zlo zhi
|
||||
|
||||
Masses
|
||||
|
||||
1 1.0079
|
||||
2 69.723
|
||||
|
||||
Atoms # atomic
|
||||
|
||||
1 1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0
|
||||
2 2 2.2000000000000002e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0
|
||||
3 2 2.9999999999999999e-01 2.2999999999999998e+00 0.0000000000000000e+00 0 0 0
|
||||
|
||||
Velocities
|
||||
|
||||
1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
|
||||
2 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
|
||||
3 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
|
||||
24
examples/meam/msmeam/dump.msmeam.bu
Normal file
24
examples/meam/msmeam/dump.msmeam.bu
Normal file
@ -0,0 +1,24 @@
|
||||
ITEM: TIMESTEP
|
||||
0
|
||||
ITEM: NUMBER OF ATOMS
|
||||
3
|
||||
ITEM: BOX BOUNDS pp pp pp
|
||||
-4 4
|
||||
-4 4
|
||||
-4 4
|
||||
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
|
||||
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
|
||||
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
|
||||
ITEM: TIMESTEP
|
||||
1
|
||||
ITEM: NUMBER OF ATOMS
|
||||
3
|
||||
ITEM: BOX BOUNDS pp pp pp
|
||||
-4 4
|
||||
-4 4
|
||||
-4 4
|
||||
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
|
||||
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
|
||||
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
|
||||
31
examples/meam/msmeam/in.msmeam
Normal file
31
examples/meam/msmeam/in.msmeam
Normal file
@ -0,0 +1,31 @@
|
||||
echo both
|
||||
log log.msmeam
|
||||
# Test of MEAM potential for HGa
|
||||
|
||||
# ------------------------ INITIALIZATION ----------------------------
|
||||
units metal
|
||||
dimension 3
|
||||
boundary p p p
|
||||
atom_style atomic
|
||||
variable latparam equal 4.646
|
||||
variable ncell equal 3
|
||||
|
||||
# ----------------------- ATOM DEFINITION ----------------------------
|
||||
region box block -4 4 -4 4 -4 4
|
||||
create_box 2 box
|
||||
|
||||
#
|
||||
|
||||
include potential.mod
|
||||
create_atoms 1 single 0 0 0 units box
|
||||
create_atoms 2 single 2.2 0 0 units box
|
||||
create_atoms 2 single 0.3 2.3 0 units box
|
||||
# ---------- Define Settings ---------------------
|
||||
variable teng equal "c_eatoms"
|
||||
compute pot_energy all pe/atom
|
||||
compute stress all stress/atom NULL
|
||||
dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
run 1
|
||||
write_data data.msmeam
|
||||
|
||||
print "All done!"
|
||||
14
examples/meam/msmeam/library.msmeam
Normal file
14
examples/meam/msmeam/library.msmeam
Normal file
@ -0,0 +1,14 @@
|
||||
# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010
|
||||
# ms-meam data format May 2010
|
||||
# elt lat z ielement atwt
|
||||
# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub
|
||||
# - t0 t1 t2 t3 t1m t2m t3m rozero ibar
|
||||
# NOTE: leading character cannot be a space
|
||||
|
||||
'H' 'dim' 1.0 1 1.0079
|
||||
2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50
|
||||
1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0
|
||||
|
||||
'Ga4' 'fcc' 12.0 31 69.723
|
||||
4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97
|
||||
1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0
|
||||
107
examples/meam/msmeam/log.msmeam.bu
Normal file
107
examples/meam/msmeam/log.msmeam.bu
Normal file
@ -0,0 +1,107 @@
|
||||
# Test of MEAM potential for HGa
|
||||
|
||||
# ------------------------ INITIALIZATION ----------------------------
|
||||
units metal
|
||||
dimension 3
|
||||
boundary p p p
|
||||
atom_style atomic
|
||||
variable latparam equal 4.646
|
||||
variable ncell equal 3
|
||||
|
||||
# ----------------------- ATOM DEFINITION ----------------------------
|
||||
region box block -4 4 -4 4 -4 4
|
||||
create_box 2 box
|
||||
Created orthogonal box = (-4 -4 -4) to (4 4 4)
|
||||
1 by 1 by 1 MPI processor grid
|
||||
|
||||
#
|
||||
|
||||
include potential.mod
|
||||
# NOTE: This script can be modified for different pair styles
|
||||
# See in.elastic for more info.
|
||||
|
||||
variable Pu string H
|
||||
print "potential chosen ${Pu}"
|
||||
potential chosen H
|
||||
# Choose potential
|
||||
pair_style MSmeam
|
||||
print "we just executed"
|
||||
we just executed
|
||||
|
||||
pair_coeff * * library.MSmeam ${Pu} Ga4 HGaMS.meam ${Pu} Ga4
|
||||
pair_coeff * * library.MSmeam H Ga4 HGaMS.meam ${Pu} Ga4
|
||||
pair_coeff * * library.MSmeam H Ga4 HGaMS.meam H Ga4
|
||||
Reading potential file library.MSmeam with DATE: 2018-09-22
|
||||
# Setup neighbor style
|
||||
neighbor 1.0 nsq
|
||||
neigh_modify once no every 1 delay 0 check yes
|
||||
|
||||
# Setup minimization style
|
||||
variable dmax equal 1.0e-2
|
||||
min_style cg
|
||||
min_modify dmax ${dmax} line quadratic
|
||||
min_modify dmax 0.01 line quadratic
|
||||
compute eng all pe/atom
|
||||
compute eatoms all reduce sum c_eng
|
||||
|
||||
# Setup output
|
||||
thermo 100
|
||||
thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms
|
||||
thermo_modify norm yes
|
||||
create_atoms 1 single 0 0 0 units box
|
||||
Created 1 atoms
|
||||
create_atoms 2 single 2.2 0 0 units box
|
||||
Created 1 atoms
|
||||
create_atoms 2 single 0.3 2.3 0 units box
|
||||
Created 1 atoms
|
||||
# ---------- Define Settings ---------------------
|
||||
variable teng equal "c_eatoms"
|
||||
compute pot_energy all pe/atom
|
||||
compute stress all stress/atom NULL
|
||||
dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
run 1
|
||||
WARNING: No fixes defined, atoms won't move (../verlet.cpp:55)
|
||||
Neighbor list info ...
|
||||
2 neighbor list requests
|
||||
update every 1 steps, delay 0 steps, check yes
|
||||
max neighbors/atom: 2000, page size: 100000
|
||||
master list distance cutoff = 6.9
|
||||
ghost atom cutoff = 6.9
|
||||
Memory usage per processor = 12.9295 Mbytes
|
||||
Step Temp TotEng Press Pxx Pyy Pzz Pxy Pxz Pyz Lx Ly Lz Volume eatoms
|
||||
0 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079
|
||||
1 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079
|
||||
Loop time of 0.000172138 on 1 procs for 1 steps with 3 atoms
|
||||
|
||||
Performance: 501.922 ns/day, 0.048 hours/ns, 5809.285 timesteps/s
|
||||
81.3% CPU use with 1 MPI tasks x no OpenMP threads
|
||||
|
||||
MPI task timing breakdown:
|
||||
Section | min time | avg time | max time |%varavg| %total
|
||||
---------------------------------------------------------------
|
||||
Pair | 6.6996e-05 | 6.6996e-05 | 6.6996e-05 | 0.0 | 38.92
|
||||
Neigh | 0 | 0 | 0 | 0.0 | 0.00
|
||||
Comm | 1.9073e-06 | 1.9073e-06 | 1.9073e-06 | 0.0 | 1.11
|
||||
Output | 9.7036e-05 | 9.7036e-05 | 9.7036e-05 | 0.0 | 56.37
|
||||
Modify | 0 | 0 | 0 | 0.0 | 0.00
|
||||
Other | | 6.199e-06 | | | 3.60
|
||||
|
||||
Nlocal: 3 ave 3 max 3 min
|
||||
Histogram: 1 0 0 0 0 0 0 0 0 0
|
||||
Nghost: 78 ave 78 max 78 min
|
||||
Histogram: 1 0 0 0 0 0 0 0 0 0
|
||||
Neighs: 7 ave 7 max 7 min
|
||||
Histogram: 1 0 0 0 0 0 0 0 0 0
|
||||
FullNghs: 14 ave 14 max 14 min
|
||||
Histogram: 1 0 0 0 0 0 0 0 0 0
|
||||
|
||||
Total # of neighbors = 14
|
||||
Ave neighs/atom = 4.66667
|
||||
Neighbor list builds = 0
|
||||
Dangerous builds = 0
|
||||
write_data data.msmeam
|
||||
|
||||
print "All done!"
|
||||
All done!
|
||||
Total wall time: 0:00:00
|
||||
|
||||
24
examples/meam/msmeam/msmeam.dump.bu
Normal file
24
examples/meam/msmeam/msmeam.dump.bu
Normal file
@ -0,0 +1,24 @@
|
||||
ITEM: TIMESTEP
|
||||
0
|
||||
ITEM: NUMBER OF ATOMS
|
||||
3
|
||||
ITEM: BOX BOUNDS pp pp pp
|
||||
-4 4
|
||||
-4 4
|
||||
-4 4
|
||||
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
|
||||
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
|
||||
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
|
||||
ITEM: TIMESTEP
|
||||
1
|
||||
ITEM: NUMBER OF ATOMS
|
||||
3
|
||||
ITEM: BOX BOUNDS pp pp pp
|
||||
-4 4
|
||||
-4 4
|
||||
-4 4
|
||||
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
|
||||
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
|
||||
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
|
||||
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
|
||||
25
examples/meam/msmeam/potential.mod
Normal file
25
examples/meam/msmeam/potential.mod
Normal file
@ -0,0 +1,25 @@
|
||||
# NOTE: This script can be modified for different pair styles
|
||||
# See in.elastic for more info.
|
||||
|
||||
variable Pu string H
|
||||
print "potential chosen ${Pu}"
|
||||
# Choose potential
|
||||
pair_style meam/ms
|
||||
print "we just executed"
|
||||
|
||||
pair_coeff * * library.msmeam ${Pu} Ga4 HGa.meam ${Pu} Ga4
|
||||
# Setup neighbor style
|
||||
neighbor 1.0 bin
|
||||
neigh_modify once no every 1 delay 0 check yes
|
||||
|
||||
# Setup minimization style
|
||||
variable dmax equal 1.0e-2
|
||||
min_style cg
|
||||
min_modify dmax ${dmax} line quadratic
|
||||
compute eng all pe/atom
|
||||
compute eatoms all reduce sum c_eng
|
||||
|
||||
# Setup output
|
||||
thermo 100
|
||||
thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms
|
||||
thermo_modify norm yes
|
||||
@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda
|
||||
endif
|
||||
|
||||
gpu_SYSINC =
|
||||
gpu_SYSLIB = -lcudart -lcuda
|
||||
gpu_SYSLIB = -lcudart -lcuda -lcufft
|
||||
gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs
|
||||
|
||||
|
||||
@ -1,8 +1,16 @@
|
||||
# Common headers for kernels
|
||||
PRE1_H = lal_preprocessor.h lal_aux_fun1.h
|
||||
|
||||
# Headers for Geryon
|
||||
UCL_H = $(wildcard ./geryon/ucl*.h)
|
||||
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \
|
||||
lal_pre_cuda_hip.h
|
||||
ALL_H = $(NVD_H) $(wildcard ./lal_*.h)
|
||||
|
||||
# Headers for Host files
|
||||
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
|
||||
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
|
||||
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
|
||||
lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H)
|
||||
|
||||
# Source files
|
||||
SRCS := $(wildcard ./lal_*.cpp)
|
||||
@ -54,13 +62,40 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
|
||||
$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
|
||||
$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
|
||||
|
||||
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
|
||||
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
|
||||
$(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu
|
||||
$(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@
|
||||
|
||||
# host code compilation
|
||||
|
||||
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
|
||||
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
#ifdef CUDPP_OPT
|
||||
@ -77,10 +112,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
|
||||
$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
|
||||
|
||||
$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
|
||||
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
|
||||
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini
|
||||
|
||||
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
|
||||
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
|
||||
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini
|
||||
#endif
|
||||
|
||||
# build libgpu.a
|
||||
|
||||
@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h)
|
||||
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h
|
||||
|
||||
# Headers for Host files
|
||||
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \
|
||||
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
|
||||
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
|
||||
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
|
||||
lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H)
|
||||
@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.
|
||||
$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h
|
||||
$(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h;
|
||||
|
||||
$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h
|
||||
$(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h;
|
||||
|
||||
$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
|
||||
$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;
|
||||
|
||||
|
||||
@ -26,6 +26,9 @@
|
||||
#ifdef UCL_DEBUG
|
||||
#define UCL_SYNC_DEBUG
|
||||
#define UCL_DESTRUCT_CHECK
|
||||
#define UCL_DEBUG_ARG(arg) arg
|
||||
#else
|
||||
#define UCL_DEBUG_ARG(arg)
|
||||
#endif
|
||||
|
||||
#ifndef UCL_NO_API_CHECK
|
||||
|
||||
@ -33,6 +33,9 @@
|
||||
#ifdef UCL_DEBUG
|
||||
#define UCL_SYNC_DEBUG
|
||||
#define UCL_DESTRUCT_CHECK
|
||||
#define UCL_DEBUG_ARG(arg) arg
|
||||
#else
|
||||
#define UCL_DEBUG_ARG(arg)
|
||||
#endif
|
||||
|
||||
#ifndef UCL_NO_API_CHECK
|
||||
|
||||
@ -309,15 +309,14 @@ class UCL_Device {
|
||||
/// Return the maximum memory pitch in bytes for current device
|
||||
inline size_t max_pitch() { return max_pitch(_device); }
|
||||
/// Return the maximum memory pitch in bytes
|
||||
inline size_t max_pitch(const int i) { return 0; }
|
||||
inline size_t max_pitch(const int) { return 0; }
|
||||
|
||||
/// Returns false if accelerator cannot be shared by multiple processes
|
||||
/** If it cannot be determined, true is returned **/
|
||||
inline bool sharing_supported() { return sharing_supported(_device); }
|
||||
/// Returns false if accelerator cannot be shared by multiple processes
|
||||
/** If it cannot be determined, true is returned **/
|
||||
inline bool sharing_supported(const int i)
|
||||
{ return true; }
|
||||
inline bool sharing_supported(const int) { return true; }
|
||||
|
||||
/// True if the device is a sub-device
|
||||
inline bool is_subdevice()
|
||||
|
||||
@ -33,6 +33,9 @@
|
||||
#ifdef UCL_DEBUG
|
||||
#define UCL_SYNC_DEBUG
|
||||
#define UCL_DESTRUCT_CHECK
|
||||
#define UCL_DEBUG_ARG(arg) arg
|
||||
#else
|
||||
#define UCL_DEBUG_ARG(arg)
|
||||
#endif
|
||||
|
||||
#ifndef UCL_NO_API_CHECK
|
||||
|
||||
@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
|
||||
|
||||
template <class mat_type>
|
||||
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
|
||||
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){
|
||||
cl_mem_flags buffer_perm;
|
||||
cl_map_flags map_perm;
|
||||
if (kind==UCL_READ_ONLY) {
|
||||
@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> {
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
||||
cl_command_queue &cq, const cl_bool block,
|
||||
const size_t dst_offset, const size_t src_offset) {
|
||||
const size_t /*dst_offset*/, const size_t src_offset) {
|
||||
if (src.cbegin()==dst.cbegin()) {
|
||||
#ifdef UCL_DBG_MEM_TRACE
|
||||
std::cerr << "UCL_COPY 1S\n";
|
||||
@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> {
|
||||
template <class p1, class p2>
|
||||
static inline void mc(p1 &dst, const p2 &src, const size_t n,
|
||||
cl_command_queue &cq, const cl_bool block,
|
||||
const size_t dst_offset, const size_t src_offset) {
|
||||
const size_t dst_offset, const size_t /*src_offset*/) {
|
||||
if (src.cbegin()==dst.cbegin()) {
|
||||
if (block) ucl_sync(cq);
|
||||
#ifdef UCL_DBG_MEM_TRACE
|
||||
|
||||
@ -35,19 +35,19 @@ class UCL_Texture {
|
||||
UCL_Texture() {}
|
||||
~UCL_Texture() {}
|
||||
/// Construct with a specified texture reference
|
||||
inline UCL_Texture(UCL_Program &prog, const char *texture_name) { }
|
||||
inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
|
||||
/// Set the texture reference for this object
|
||||
inline void get_texture(UCL_Program &prog, const char *texture_name) { }
|
||||
inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
|
||||
|
||||
/// Bind a float array where each fetch grabs a vector of length numel
|
||||
template<class mat_typ>
|
||||
inline void bind_float(mat_typ &vec, const unsigned numel) { }
|
||||
inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { }
|
||||
|
||||
/// Unbind the texture reference from the memory allocation
|
||||
inline void unbind() { }
|
||||
|
||||
/// Make a texture reference available to kernel
|
||||
inline void allow(UCL_Kernel &kernel) { }
|
||||
inline void allow(UCL_Kernel & /*kernel*/) { }
|
||||
|
||||
private:
|
||||
friend class UCL_Kernel;
|
||||
@ -62,7 +62,7 @@ class UCL_Const {
|
||||
inline UCL_Const(UCL_Program &prog, const char *global_name)
|
||||
{ get_global(prog,global_name); }
|
||||
/// Set the global reference for this object
|
||||
inline void get_global(UCL_Program &prog, const char *global_name) {
|
||||
inline void get_global(UCL_Program &prog, const char * /*global_name*/) {
|
||||
if (_active) {
|
||||
CL_DESTRUCT_CALL(clReleaseContext(_context));
|
||||
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
|
||||
|
||||
@ -71,7 +71,7 @@ class UCL_Timer {
|
||||
inline void init(UCL_Device &dev) { init(dev,dev.cq()); }
|
||||
|
||||
/// Initialize command queue for timing
|
||||
inline void init(UCL_Device &dev, command_queue &cq) {
|
||||
inline void init(UCL_Device & /*dev*/, command_queue &cq) {
|
||||
clear();
|
||||
_cq=cq;
|
||||
clRetainCommandQueue(_cq);
|
||||
|
||||
@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> {
|
||||
// Should never be here
|
||||
template <int host_t1, int host_t2> struct _host_host_copy {
|
||||
template <class mat1, class mat2>
|
||||
static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
|
||||
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2>
|
||||
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols) {
|
||||
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
};
|
||||
@ -470,24 +469,22 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
|
||||
// Neither on host or both on host
|
||||
template <> struct _ucl_cast_copy<1,1> {
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer, command_queue &cq) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
|
||||
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
|
||||
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
command_queue &cq) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
|
||||
const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
};
|
||||
@ -495,24 +492,22 @@ template <> struct _ucl_cast_copy<1,1> {
|
||||
// Neither on host or both on host
|
||||
template <> struct _ucl_cast_copy<0,0> {
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer, command_queue &cq) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
|
||||
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
|
||||
mat3 &cast_buffer) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
|
||||
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
template <class mat1, class mat2, class mat3>
|
||||
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
|
||||
const size_t cols, mat3 &cast_buffer,
|
||||
command_queue &cq) {
|
||||
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
|
||||
const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
|
||||
assert(0==1);
|
||||
}
|
||||
};
|
||||
|
||||
@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat {
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
|
||||
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
#endif
|
||||
@ -230,8 +230,8 @@ class UCL_D_Vec : public UCL_BaseMat {
|
||||
* - The view does not prevent the memory from being freed by the
|
||||
* allocating container when using CUDA APIs **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
const size_t cols) {
|
||||
inline void view_offset(const size_t offset,ucl_type &input,
|
||||
const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
#endif
|
||||
|
||||
@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
|
||||
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
#endif
|
||||
@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device pointer on the host is not supported **/
|
||||
template <class ptr_type>
|
||||
inline void view(ptr_type *input, const size_t rows, const size_t cols,
|
||||
inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols,
|
||||
UCL_Device &dev) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat {
|
||||
* allocating container when using CUDA APIs
|
||||
* - Viewing a device container on the host is not supported **/
|
||||
template <class ucl_type>
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
|
||||
inline void view_offset(const size_t offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows),
|
||||
const size_t cols) {
|
||||
#ifdef UCL_DEBUG
|
||||
assert(rows==1);
|
||||
|
||||
@ -27,7 +27,7 @@ template <int st> struct _ucl_s_obj_help;
|
||||
// -- Can potentially use same memory if shared by accelerator
|
||||
template <> struct _ucl_s_obj_help<1> {
|
||||
template <class t1, class t2, class t3>
|
||||
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
|
||||
static inline int alloc(t1 &host, t2 &device, t3 & /*_buffer*/,
|
||||
const int cols, UCL_Device &acc,
|
||||
const enum UCL_MEMOPT kind1,
|
||||
const enum UCL_MEMOPT kind2) {
|
||||
@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> {
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
|
||||
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) {
|
||||
ucl_copy(dst,src,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
|
||||
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) {
|
||||
ucl_copy(dst,src,cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
const bool async) {
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) {
|
||||
ucl_copy(dst,src,cols,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
|
||||
command_queue &cq) {
|
||||
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) {
|
||||
ucl_copy(dst,src,cols,cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
t3 &buffer, const bool async) {
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) {
|
||||
ucl_copy(dst,src,rows,cols,async);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
|
||||
t3 &buffer, command_queue &cq) {
|
||||
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) {
|
||||
ucl_copy(dst,src,rows,cols,cq);
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) {
|
||||
if (device.kind()==UCL_VIEW) {
|
||||
device.view(host);
|
||||
return UCL_SUCCESS;
|
||||
@ -353,7 +349,7 @@ template <int st> struct _ucl_s_obj_help {
|
||||
}
|
||||
|
||||
template <class t1, class t2, class t3>
|
||||
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
|
||||
static inline int dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) {
|
||||
int err=buff.resize(cols);
|
||||
if (err!=UCL_SUCCESS)
|
||||
return err;
|
||||
|
||||
322
lib/gpu/lal_amoeba.cpp
Normal file
322
lib/gpu/lal_amoeba.cpp
Normal file
@ -0,0 +1,322 @@
|
||||
/***************************************************************************
|
||||
amoeba.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the amoeba pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "amoeba_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *amoeba=0;
|
||||
#else
|
||||
#include "amoeba_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_amoeba.h"
|
||||
#include <cassert>
|
||||
namespace LAMMPS_AL {
|
||||
#define AmoebaT Amoeba<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
AmoebaT::Amoeba() : BaseAmoeba<numtyp,acctyp>(),
|
||||
_allocated(false) {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
AmoebaT::~Amoeba() {
|
||||
clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_dirdamp, const int *host_amtype2class,
|
||||
const double *host_special_hal,
|
||||
const double * /*host_special_repel*/,
|
||||
const double * /*host_special_disp*/,
|
||||
const double *host_special_mpole,
|
||||
const double * /*host_special_polar_wscale*/,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const double *host_csix, const double *host_adisp,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15,
|
||||
const double cell_size, const double gpu_split, FILE *_screen,
|
||||
const double polar_dscale, const double polar_uscale) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||
cell_size,gpu_split,_screen,amoeba,
|
||||
"k_amoeba_multipole", "k_amoeba_udirect2b",
|
||||
"k_amoeba_umutual2b", "k_amoeba_polar",
|
||||
"k_amoeba_fphi_uind", "k_amoeba_fphi_mpole",
|
||||
"k_amoeba_short_nbor", "k_amoeba_special15");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
int lj_types=ntypes;
|
||||
shared_types=false;
|
||||
int max_shared_types=this->device->max_shared_types();
|
||||
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||
lj_types=max_shared_types;
|
||||
shared_types=true;
|
||||
}
|
||||
_lj_types=lj_types;
|
||||
|
||||
// Allocate a host write buffer for data initialization
|
||||
|
||||
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
for (int i = 0; i < max_amtype; i++) {
|
||||
host_write[i].x = host_pdamp[i];
|
||||
host_write[i].y = host_thole[i];
|
||||
host_write[i].z = host_dirdamp[i];
|
||||
host_write[i].w = host_amtype2class[i];
|
||||
}
|
||||
|
||||
coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_amtype,host_write,false);
|
||||
|
||||
UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
for (int i = 0; i < max_amclass; i++) {
|
||||
host_write2[i].x = host_csix[i];
|
||||
host_write2[i].y = host_adisp[i];
|
||||
host_write2[i].z = (numtyp)0;
|
||||
host_write2[i].w = (numtyp)0;
|
||||
}
|
||||
|
||||
coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_amclass,host_write2,false);
|
||||
|
||||
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<5; i++) {
|
||||
dview[i].x=host_special_hal[i];
|
||||
dview[i].y=host_special_polar_piscale[i];
|
||||
dview[i].z=host_special_polar_pscale[i];
|
||||
dview[i].w=host_special_mpole[i];
|
||||
}
|
||||
ucl_copy(sp_amoeba,dview,5,false);
|
||||
|
||||
_polar_dscale = polar_dscale;
|
||||
_polar_uscale = polar_uscale;
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes()
|
||||
+ sp_amoeba.row_bytes() + this->_tep.row_bytes()
|
||||
+ this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
|
||||
+ this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
|
||||
+ this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AmoebaT::clear() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
coeff_amtype.clear();
|
||||
coeff_amclass.clear();
|
||||
sp_amoeba.clear();
|
||||
|
||||
this->clear_atomic();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double AmoebaT::host_memory_usage() const {
|
||||
return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate the multipole real-space term, returning tep
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::multipole_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff off2_mpole,
|
||||
// at this point mpole is the first kernel in a time step for AMOEBA
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_mpole, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
this->k_multipole.set_size(GX,BX);
|
||||
this->k_multipole.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &sp_amoeba,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the real-space permanent field kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff _off2_polar, if not done yet
|
||||
// this is the first kernel in a time step where _off2_polar is used
|
||||
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_polar, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_udirect2b.set_size(GX,BX);
|
||||
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
|
||||
&_polar_dscale, &_polar_uscale);
|
||||
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the real-space induced field kernel, returning field and fieldp
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list if not done yet
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->dev_short_nbor,
|
||||
&this->_off2_polar, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_umutual2b.set_size(GX,BX);
|
||||
this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_aewald,
|
||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the polar real-space kernel, returning tep
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::polar_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
|
||||
const int BX=this->block_size();
|
||||
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
|
||||
/*
|
||||
const int cus = this->device->gpu->cus();
|
||||
while (GX < cus && GX > 1) {
|
||||
BX /= 2;
|
||||
GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
|
||||
}
|
||||
*/
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list if not done yet
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_polar, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_polar.set_size(GX,BX);
|
||||
this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
|
||||
// Signal that short nbor list is not avail for the next time step
|
||||
// do it here because polar_real() is the last kernel in a time step at this point
|
||||
|
||||
this->short_nbor_polar_avail = false;
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
template class Amoeba<PRECISION,ACC_PRECISION>;
|
||||
}
|
||||
2099
lib/gpu/lal_amoeba.cu
Normal file
2099
lib/gpu/lal_amoeba.cu
Normal file
File diff suppressed because it is too large
Load Diff
100
lib/gpu/lal_amoeba.h
Normal file
100
lib/gpu/lal_amoeba.h
Normal file
@ -0,0 +1,100 @@
|
||||
/***************************************************************************
|
||||
amoeba.h
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the amoeba pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_AMOEBA_H
|
||||
#define LAL_AMOEBA_H
|
||||
|
||||
#include "lal_base_amoeba.h"
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class Amoeba : public BaseAmoeba<numtyp, acctyp> {
|
||||
public:
|
||||
Amoeba();
|
||||
~Amoeba();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_dirdamp, const int *host_amtype2class,
|
||||
const double *host_special_mpole,
|
||||
const double *host_special_hal,
|
||||
const double *host_special_repel,
|
||||
const double *host_special_disp,
|
||||
const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const double *host_csix, const double *host_adisp,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||
const double gpu_split, FILE *_screen,
|
||||
const double polar_dscale, const double polar_uscale);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage() const;
|
||||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
|
||||
/// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
|
||||
UCL_D_Vec<numtyp4> coeff_amtype;
|
||||
/// csix = coeff_amclass.x; adisp = coeff_amclass.y;
|
||||
UCL_D_Vec<numtyp4> coeff_amclass;
|
||||
/// Special amoeba values [0-4]:
|
||||
/// sp_amoeba.x = special_hal
|
||||
/// sp_amoeba.y = special_polar_pscale,
|
||||
/// sp_amoeba.z = special_polar_piscale
|
||||
/// sp_amoeba.w = special_mpole
|
||||
UCL_D_Vec<numtyp4> sp_amoeba;
|
||||
|
||||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
numtyp _polar_dscale, _polar_uscale;
|
||||
numtyp _qqrd2e;
|
||||
|
||||
protected:
|
||||
bool _allocated;
|
||||
int multipole_real(const int eflag, const int vflag);
|
||||
int udirect2b(const int eflag, const int vflag);
|
||||
int umutual2b(const int eflag, const int vflag);
|
||||
int polar_real(const int eflag, const int vflag);
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
213
lib/gpu/lal_amoeba_ext.cpp
Normal file
213
lib/gpu/lal_amoeba_ext.cpp
Normal file
@ -0,0 +1,213 @@
|
||||
/***************************************************************************
|
||||
amoeba_ext.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Functions for LAMMPS access to amoeba acceleration routines.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
#include "lal_amoeba.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace LAMMPS_AL;
|
||||
|
||||
static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_dirdamp, const int *host_amtype2class,
|
||||
const double *host_special_hal,
|
||||
const double *host_special_repel,
|
||||
const double *host_special_disp,
|
||||
const double *host_special_mpole,
|
||||
const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const double *host_csix, const double *host_adisp,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15,
|
||||
const double cell_size, int &gpu_mode, FILE *screen,
|
||||
const double polar_dscale, const double polar_uscale) {
|
||||
AMOEBAMF.clear();
|
||||
gpu_mode=AMOEBAMF.device->gpu_mode();
|
||||
double gpu_split=AMOEBAMF.device->particle_split();
|
||||
int first_gpu=AMOEBAMF.device->first_device();
|
||||
int last_gpu=AMOEBAMF.device->last_device();
|
||||
int world_me=AMOEBAMF.device->world_me();
|
||||
int gpu_rank=AMOEBAMF.device->gpu_rank();
|
||||
int procs_per_gpu=AMOEBAMF.device->procs_per_gpu();
|
||||
|
||||
AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu);
|
||||
|
||||
bool message=false;
|
||||
if (AMOEBAMF.device->replica_me()==0 && screen)
|
||||
message=true;
|
||||
|
||||
if (message) {
|
||||
fprintf(screen,"Initializing GPU and compiling on process 0...");
|
||||
fflush(screen);
|
||||
}
|
||||
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
|
||||
host_pdamp, host_thole, host_dirdamp,
|
||||
host_amtype2class, host_special_hal,
|
||||
host_special_repel, host_special_disp,
|
||||
host_special_mpole, host_special_polar_wscale,
|
||||
host_special_polar_piscale, host_special_polar_pscale,
|
||||
host_csix, host_adisp, nlocal, nall, max_nbors,
|
||||
maxspecial, maxspecial15, cell_size, gpu_split,
|
||||
screen, polar_dscale, polar_uscale);
|
||||
|
||||
AMOEBAMF.device->world_barrier();
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
|
||||
for (int i=0; i<procs_per_gpu; i++) {
|
||||
if (message) {
|
||||
if (last_gpu-first_gpu==0)
|
||||
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
|
||||
else
|
||||
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
|
||||
last_gpu,i);
|
||||
fflush(screen);
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
|
||||
host_pdamp, host_thole, host_dirdamp,
|
||||
host_amtype2class, host_special_hal,
|
||||
host_special_repel, host_special_disp,
|
||||
host_special_mpole, host_special_polar_wscale,
|
||||
host_special_polar_piscale, host_special_polar_pscale,
|
||||
host_csix, host_adisp, nlocal, nall, max_nbors,
|
||||
maxspecial, maxspecial15, cell_size, gpu_split,
|
||||
screen, polar_dscale, polar_uscale);
|
||||
|
||||
AMOEBAMF.device->gpu_barrier();
|
||||
if (message)
|
||||
fprintf(screen,"Done.\n");
|
||||
}
|
||||
if (message)
|
||||
fprintf(screen,"\n");
|
||||
|
||||
if (init_ok==0)
|
||||
AMOEBAMF.estimate_gpu_overhead();
|
||||
return init_ok;
|
||||
}
|
||||
|
||||
void amoeba_gpu_clear() {
|
||||
AMOEBAMF.clear();
|
||||
}
|
||||
|
||||
int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole, double ** /*host_uind*/,
|
||||
double ** /*host_uinp*/, double * /*host_pval*/,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
int *nspecial15, tagint **special15,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, double *host_q, double *boxlo, double *prd) {
|
||||
return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole,
|
||||
nullptr, nullptr, nullptr, sublo, subhi, tag,
|
||||
nspecial, special, nspecial15, special15,
|
||||
eflag_in, vflag_in, eatom, vatom,
|
||||
host_start, ilist, jnum, cpu_time,
|
||||
success, host_q, boxlo, prd);
|
||||
}
|
||||
|
||||
|
||||
void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, int *nspecial15, tagint** special15,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, const double aewald, const double felec, const double off2,
|
||||
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
|
||||
AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
|
||||
host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi,
|
||||
tag, nspecial, special, nspecial15, special15,
|
||||
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
|
||||
cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
const double aewald, const double off2, void **fieldp_ptr) {
|
||||
AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
|
||||
aewald, off2, fieldp_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
const double aewald, const double off2, void **fieldp_ptr) {
|
||||
AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
|
||||
aewald, off2, fieldp_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_update_fieldp(void **fieldp_ptr) {
|
||||
AMOEBAMF.update_fieldp(fieldp_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
const double aewald, const double felec, const double off2,
|
||||
void **tep_ptr) {
|
||||
AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
|
||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||
}
|
||||
|
||||
void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, int** igrid,
|
||||
const int nzlo_out, const int nzhi_out,
|
||||
const int nylo_out, const int nyhi_out,
|
||||
const int nxlo_out, const int nxhi_out) {
|
||||
AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid,
|
||||
nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
||||
}
|
||||
|
||||
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
|
||||
void **host_fdip_phi2, void **host_fdip_sum_phi) {
|
||||
AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1,
|
||||
host_fdip_phi2, host_fdip_sum_phi);
|
||||
}
|
||||
|
||||
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) {
|
||||
AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec);
|
||||
}
|
||||
|
||||
void amoeba_setup_fft(const int numel, const int element_type) {
|
||||
AMOEBAMF.setup_fft(numel, element_type);
|
||||
}
|
||||
|
||||
void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) {
|
||||
AMOEBAMF.compute_fft1d(in, out, numel, mode);
|
||||
}
|
||||
|
||||
double amoeba_gpu_bytes() {
|
||||
return AMOEBAMF.host_memory_usage();
|
||||
}
|
||||
@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const {
|
||||
bytes+=sizeof(numtyp);
|
||||
if (_vel)
|
||||
bytes+=4*sizeof(numtyp);
|
||||
if (_extra_fields>0)
|
||||
bytes+=_extra_fields*sizeof(numtyp4);
|
||||
return bytes;
|
||||
}
|
||||
|
||||
@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) {
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=v.device.row_bytes();
|
||||
}
|
||||
if (_extra_fields>0) {
|
||||
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=extra.device.row_bytes();
|
||||
}
|
||||
|
||||
if (_gpu_nbor>0) {
|
||||
if (_bonds) {
|
||||
@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
const int gpu_nbor, const bool bonds, const bool vel) {
|
||||
const int gpu_nbor, const bool bonds, const bool vel,
|
||||
const int extra_fields) {
|
||||
bool success=true;
|
||||
// Ignore host/device transfers?
|
||||
int gpu_bytes=0;
|
||||
@ -191,7 +199,17 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
}
|
||||
}
|
||||
|
||||
if (bonds && !_bonds) {
|
||||
if (extra_fields > 0 && _extra_fields==0) {
|
||||
_extra_fields=extra_fields;
|
||||
_other=true;
|
||||
if (_host_view==false) {
|
||||
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
gpu_bytes+=extra.device.row_bytes();
|
||||
}
|
||||
}
|
||||
|
||||
if (bonds && _bonds==false) {
|
||||
_bonds=true;
|
||||
if (_bonds && _gpu_nbor>0) {
|
||||
success=success && (dev_tag.alloc(_max_atoms,*dev,
|
||||
@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool AtomT::init(const int nall, const bool charge, const bool rot,
|
||||
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
|
||||
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel,
|
||||
const int extra_fields) {
|
||||
clear();
|
||||
|
||||
bool success=true;
|
||||
@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
|
||||
_q_avail=false;
|
||||
_quat_avail=false;
|
||||
_v_avail=false;
|
||||
_extra_avail=false;
|
||||
_resized=false;
|
||||
_gpu_nbor=gpu_nbor;
|
||||
_bonds=bonds;
|
||||
_charge=charge;
|
||||
_rot=rot;
|
||||
_vel=vel;
|
||||
_other=_charge || _rot || _vel;
|
||||
_extra_fields=extra_fields;
|
||||
_other=_charge || _rot || _vel || (extra_fields>0);
|
||||
dev=&devi;
|
||||
_time_transfer=0;
|
||||
|
||||
@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
|
||||
time_q.init(*dev);
|
||||
time_quat.init(*dev);
|
||||
time_vel.init(*dev);
|
||||
time_extra.init(*dev);
|
||||
|
||||
time_pos.zero();
|
||||
time_q.zero();
|
||||
time_quat.zero();
|
||||
time_vel.zero();
|
||||
time_extra.zero();
|
||||
|
||||
_time_cast=0.0;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
@ -308,6 +333,8 @@ void AtomT::clear_resize() {
|
||||
quat.clear();
|
||||
if (_vel)
|
||||
v.clear();
|
||||
if (_extra_fields>0)
|
||||
extra.clear();
|
||||
|
||||
dev_cell_id.clear();
|
||||
dev_particle_id.clear();
|
||||
@ -350,6 +377,7 @@ void AtomT::clear() {
|
||||
time_q.clear();
|
||||
time_quat.clear();
|
||||
time_vel.clear();
|
||||
time_extra.clear();
|
||||
clear_resize();
|
||||
|
||||
#ifdef GPU_CAST
|
||||
@ -370,12 +398,19 @@ double AtomT::host_memory_usage() const {
|
||||
atom_bytes+=4;
|
||||
if (_vel)
|
||||
atom_bytes+=4;
|
||||
if (_extra_fields>0)
|
||||
atom_bytes+=_extra_fields;
|
||||
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
#ifdef USE_CUDPP
|
||||
#define USE_CUDPP_ARG(arg) arg
|
||||
#else
|
||||
#define USE_CUDPP_ARG(arg)
|
||||
#endif
|
||||
// Sort arrays for neighbor list calculation
|
||||
template <class numtyp, class acctyp>
|
||||
void AtomT::sort_neighbor(const int num_atoms) {
|
||||
void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) {
|
||||
#ifdef USE_CUDPP
|
||||
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
|
||||
(int *)dev_particle_id.begin(),
|
||||
|
||||
@ -76,7 +76,7 @@ class Atom {
|
||||
* gpu_nbor 2 if binning on host and neighboring on device **/
|
||||
bool init(const int nall, const bool charge, const bool rot,
|
||||
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
|
||||
const bool vel=false);
|
||||
const bool vel=false, const int extra_fields=0);
|
||||
|
||||
/// Check if we have enough device storage and realloc if not
|
||||
/** Returns true if resized with any call during this timestep **/
|
||||
@ -96,7 +96,7 @@ class Atom {
|
||||
* gpu_nbor 1 if neighboring will be performed on device
|
||||
* gpu_nbor 2 if binning on host and neighboring on device **/
|
||||
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
|
||||
const bool bonds, const bool vel=false);
|
||||
const bool bonds, const bool vel=false, const int extra_fields=0);
|
||||
|
||||
/// Returns true if GPU is using charges
|
||||
bool charge() { return _charge; }
|
||||
@ -107,6 +107,9 @@ class Atom {
|
||||
/// Returns true if GPU is using velocities
|
||||
bool velocity() { return _vel; }
|
||||
|
||||
/// Returns true if GPU is using extra fields
|
||||
bool using_extra() { return (_extra_fields>0); }
|
||||
|
||||
/// Only free matrices of length inum or nall for resizing
|
||||
void clear_resize();
|
||||
|
||||
@ -128,6 +131,8 @@ class Atom {
|
||||
time_quat.add_to_total();
|
||||
if (_vel)
|
||||
time_vel.add_to_total();
|
||||
if (_extra_fields>0)
|
||||
time_extra.add_to_total();
|
||||
}
|
||||
|
||||
/// Add copy times to timers
|
||||
@ -139,6 +144,8 @@ class Atom {
|
||||
time_quat.zero();
|
||||
if (_vel)
|
||||
time_vel.zero();
|
||||
if (_extra_fields>0)
|
||||
time_extra.zero();
|
||||
}
|
||||
|
||||
/// Return the total time for host/device data transfer
|
||||
@ -158,6 +165,10 @@ class Atom {
|
||||
total+=time_vel.total_seconds();
|
||||
time_vel.zero_total();
|
||||
}
|
||||
if (_extra_fields>0) {
|
||||
total+=time_extra.total_seconds();
|
||||
time_extra.zero_total();
|
||||
}
|
||||
|
||||
return total+_time_transfer/1000.0;
|
||||
}
|
||||
@ -281,7 +292,11 @@ class Atom {
|
||||
|
||||
/// Signal that we need to transfer atom data for next timestep
|
||||
inline void data_unavail()
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; }
|
||||
|
||||
/// Signal that we need to transfer atom extra data for next kernel call
|
||||
inline void extra_data_unavail()
|
||||
{ _extra_avail=false; }
|
||||
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { numtyp x,y,z,w; } vec4d_t;
|
||||
@ -312,7 +327,7 @@ class Atom {
|
||||
|
||||
/// Copy positions and types to device asynchronously
|
||||
/** Copies nall() elements **/
|
||||
inline void add_x_data(double **host_ptr, int *host_type) {
|
||||
inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) {
|
||||
time_pos.start();
|
||||
if (_x_avail==false) {
|
||||
#ifdef GPU_CAST
|
||||
@ -426,7 +441,7 @@ class Atom {
|
||||
|
||||
/// Copy velocities and tags to device asynchronously
|
||||
/** Copies nall() elements **/
|
||||
inline void add_v_data(double **host_ptr, tagint *host_tag) {
|
||||
inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
|
||||
time_vel.start();
|
||||
if (_v_avail==false) {
|
||||
#ifdef GPU_CAST
|
||||
@ -450,6 +465,33 @@ class Atom {
|
||||
add_v_data(host_ptr,host_tag);
|
||||
}
|
||||
|
||||
// Cast extras to write buffer
|
||||
template<class cpytyp>
|
||||
inline void cast_extra_data(cpytyp *host_ptr) {
|
||||
if (_extra_avail==false) {
|
||||
double t=MPI_Wtime();
|
||||
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp parallel for simd schedule(static)
|
||||
#elif (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd
|
||||
#endif
|
||||
for (int i=0; i<_nall*_extra_fields; i++)
|
||||
extra[i]=host_ptr[i];
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy extras to device
|
||||
/** Copies nall()*_extra elements **/
|
||||
inline void add_extra_data() {
|
||||
time_extra.start();
|
||||
if (_extra_avail==false) {
|
||||
extra.update_device(_nall*_extra_fields,true);
|
||||
_extra_avail=true;
|
||||
}
|
||||
time_extra.stop();
|
||||
}
|
||||
|
||||
/// Add in casting time from additional data (seconds)
|
||||
inline void add_cast_time(double t) { _time_cast+=t; }
|
||||
|
||||
@ -473,6 +515,8 @@ class Atom {
|
||||
UCL_Vector<numtyp,numtyp> quat;
|
||||
/// Velocities
|
||||
UCL_Vector<numtyp,numtyp> v;
|
||||
/// Extras
|
||||
UCL_Vector<numtyp4,numtyp4> extra;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
UCL_Vector<numtyp,numtyp> x_cast;
|
||||
@ -493,7 +537,7 @@ class Atom {
|
||||
UCL_H_Vec<int> host_particle_id;
|
||||
|
||||
/// Device timers
|
||||
UCL_Timer time_pos, time_q, time_quat, time_vel;
|
||||
UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra;
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *dev;
|
||||
@ -508,11 +552,12 @@ class Atom {
|
||||
bool _compiled;
|
||||
|
||||
// True if data has been copied to device already
|
||||
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
|
||||
bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized;
|
||||
|
||||
bool alloc(const int nall);
|
||||
|
||||
bool _allocated, _rot, _charge, _bonds, _vel, _other;
|
||||
int _extra_fields;
|
||||
int _max_atoms, _nall, _gpu_nbor;
|
||||
bool _host_view;
|
||||
double _time_cast, _time_transfer;
|
||||
|
||||
962
lib/gpu/lal_base_amoeba.cpp
Normal file
962
lib/gpu/lal_base_amoeba.cpp
Normal file
@ -0,0 +1,962 @@
|
||||
/***************************************************************************
|
||||
base_amoeba.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Base class for pair styles needing per-particle data for position,
|
||||
charge, and type.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#include "lal_base_amoeba.h"
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) {
|
||||
device=&global_device;
|
||||
ans=new Answer<numtyp,acctyp>();
|
||||
nbor=new Neighbor();
|
||||
pair_program=nullptr;
|
||||
ucl_device=nullptr;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseAmoebaT::~BaseAmoeba() {
|
||||
delete ans;
|
||||
delete nbor;
|
||||
k_multipole.clear();
|
||||
k_udirect2b.clear();
|
||||
k_umutual2b.clear();
|
||||
k_fphi_uind.clear();
|
||||
k_fphi_mpole.clear();
|
||||
k_polar.clear();
|
||||
k_special15.clear();
|
||||
k_short_nbor.clear();
|
||||
|
||||
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
if (fft_plan_created) cufftDestroy(plan);
|
||||
#endif
|
||||
|
||||
if (pair_program) delete pair_program;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const {
|
||||
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
|
||||
nbor->bytes_per_atom(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const int maxspecial15,
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_name_multipole,
|
||||
const char *k_name_udirect2b,
|
||||
const char *k_name_umutual2b,
|
||||
const char *k_name_polar,
|
||||
const char *k_name_fphi_uind,
|
||||
const char *k_name_fphi_mpole,
|
||||
const char *k_name_short_nbor,
|
||||
const char* k_name_special15) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
|
||||
gpu_nbor=1;
|
||||
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
|
||||
int _gpu_host=0;
|
||||
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
||||
if (host_nlocal>0)
|
||||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
|
||||
bool charge = true;
|
||||
bool rot = false;
|
||||
bool vel = false;
|
||||
_extra_fields = 24; // round up to accomodate quadruples of numtyp values
|
||||
// rpole 13; uind 3; uinp 3; amtype, amgroup; pval
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
if (ucl_device!=device->gpu) _compiled=false;
|
||||
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
_block_bio_size=device->block_bio_pair();
|
||||
compile_kernels(*ucl_device,pair_program,k_name_multipole,
|
||||
k_name_udirect2b, k_name_umutual2b,k_name_polar,
|
||||
k_name_fphi_uind, k_name_fphi_mpole,
|
||||
k_name_short_nbor, k_name_special15);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else {
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
}
|
||||
|
||||
bool alloc_packed=false;
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
|
||||
_gpu_host,max_nbors,cell_size,alloc_packed,
|
||||
_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
|
||||
// Initialize timers for the selected GPU
|
||||
time_pair.init(*ucl_device);
|
||||
time_pair.zero();
|
||||
|
||||
pos_tex.bind_float(atom->x,4);
|
||||
q_tex.bind_float(atom->q,1);
|
||||
|
||||
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
_maxspecial=maxspecial;
|
||||
_maxspecial15=maxspecial15;
|
||||
|
||||
// allocate per-atom array tep
|
||||
|
||||
int ef_nall=nlocal; //nall;
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
|
||||
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
|
||||
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
|
||||
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
|
||||
_max_fieldp_size = _max_tep_size;
|
||||
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
|
||||
_max_thetai_size = 0;
|
||||
|
||||
_nmax = nall;
|
||||
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
fft_plan_created = false;
|
||||
#endif
|
||||
|
||||
#ifdef ASYNC_DEVICE_COPY
|
||||
_end_command_queue=ucl_device->num_queues();
|
||||
ucl_device->push_command_queue();
|
||||
#endif
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) {
|
||||
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::clear_atomic() {
|
||||
// Output any timing information
|
||||
acc_timers();
|
||||
double avg_split=hd_balancer.all_avg_split();
|
||||
_gpu_overhead*=hd_balancer.timestep();
|
||||
_driver_overhead*=hd_balancer.timestep();
|
||||
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
|
||||
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
|
||||
|
||||
time_pair.clear();
|
||||
hd_balancer.clear();
|
||||
|
||||
dev_short_nbor.clear();
|
||||
nbor->clear();
|
||||
ans->clear();
|
||||
|
||||
_tep.clear();
|
||||
_fieldp.clear();
|
||||
_thetai1.clear();
|
||||
_thetai2.clear();
|
||||
_thetai3.clear();
|
||||
_igrid.clear();
|
||||
_fdip_phi1.clear();
|
||||
_fdip_phi2.clear();
|
||||
_fdip_sum_phi.clear();
|
||||
_cgrid_brick.clear();
|
||||
|
||||
dev_nspecial15.clear();
|
||||
dev_special15.clear();
|
||||
dev_special15_t.clear();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy neighbor list from host
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
|
||||
int *numj, int **firstneigh, bool &success) {
|
||||
success=true;
|
||||
|
||||
int mn=nbor->max_nbor_loop(inum,numj,ilist);
|
||||
resize_atom(inum,nall,success);
|
||||
resize_local(inum,mn,success);
|
||||
if (!success)
|
||||
return nullptr;
|
||||
|
||||
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
|
||||
return ilist;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Build neighbor list on device
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
                                        const int nall, double **host_x,
                                        int *host_type, double *sublo,
                                        double *subhi, tagint *tag,
                                        int **nspecial, tagint **special,
                                        int *nspecial15, tagint **special15,
                                        bool &success) {
  // Build the neighbor list on the device, then augment it with AMOEBA's
  // 1-5 special neighbors.  Returns the maximum neighbor count (mn);
  // on allocation failure sets success=false and returns 0.
  success=true;
  resize_atom(inum,nall,success);
  resize_local(inum,host_inum,nbor->max_nbors(),success);
  if (!success)
    return 0;
  atom->cast_copy_x(host_x,host_type);

  int mn;
  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
                        tag, nspecial, special, success, mn, ans->error_flag);

  // add one-five neighbors

  if (_maxspecial15>0) {
    // Wrap the host 1-5 data in zero-copy views, push them to the device,
    // transpose to the layout the kernels expect, then tag 1-5 pairs in
    // the freshly built neighbor list.
    UCL_H_Vec<int> view_nspecial15;
    UCL_H_Vec<tagint> view_special15;
    view_nspecial15.view(nspecial15,nall,*ucl_device);
    view_special15.view(special15[0],nall*_maxspecial15,*ucl_device);
    ucl_copy(dev_nspecial15,view_nspecial15,nall,false);
    ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false);
    nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall);

    add_onefive_neighbors();
  }

  // Track the high-water mark of device bytes used by answers + neighbors.
  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;
  return mn;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Prepare for multiple kernel calls in a time step:
|
||||
// - reallocate per-atom arrays, if needed
|
||||
// - transfer extra data from host to device
|
||||
// - build the full neighbor lists for use by different kernels
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall,
                              double **host_x, int *host_type, int *host_amtype,
                              int *host_amgroup, double **host_rpole,
                              double **host_uind, double **host_uinp, double *host_pval,
                              double *sublo, double *subhi, tagint *tag,
                              int **nspecial, tagint **special,
                              int *nspecial15, tagint **special15,
                              const bool eflag_in, const bool vflag_in,
                              const bool eatom, const bool vatom, int &host_start,
                              int **&ilist, int **&jnum, const double cpu_time,
                              bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) {
  // Per-timestep setup: select kernels for the requested energy/virial
  // accumulation, (re)allocate per-atom device arrays, transfer positions,
  // charges and AMOEBA extra data, and rebuild neighbor lists when ago==0.
  // Returns a pointer into the host neighbor list (offset by host_start),
  // or nullptr when there is no work or allocation failed.
  acc_timers();
  // Encode accumulation mode: 2 = per-atom, 1 = global only, 0 = none.
  if (eatom) _eflag=2;
  else if (eflag_in) _eflag=1;
  else _eflag=0;
  if (vatom) _vflag=2;
  else if (vflag_in) _vflag=1;
  else _vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
  // Without block reduction support, any accumulation must be per-atom.
  if (_eflag) _eflag=2;
  if (_vflag) _vflag=2;
#endif

  set_kernel(_eflag,_vflag);

  // ------------------- Resize 1-5 neighbor arrays ------------------------

  if (nall>_nmax) {
    _nmax = nall;
    dev_nspecial15.clear();
    dev_special15.clear();
    dev_special15_t.clear();
    dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
    dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
    dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
  }

  if (inum_full==0) {
    host_start=0;
    // Make sure textures are correct if realloc by a different hybrid style
    resize_atom(0,nall,success);
    zero_timers();
    return nullptr;
  }

  // Split work between host and device according to the load balancer.
  hd_balancer.balance(cpu_time);
  int inum=hd_balancer.get_gpu_count(ago,inum_full);
  ans->inum(inum);
  host_start=inum;

  // Build neighbor list on GPU if necessary
  if (ago==0) {
    _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                                 sublo, subhi, tag, nspecial, special, nspecial15, special15,
                                 success);
    if (!success)
      return nullptr;
    atom->cast_q_data(host_q);
    hd_balancer.start_timer();
  } else {
    // Reuse the existing neighbor list; only refresh positions and charges.
    atom->cast_x_data(host_x,host_type);
    atom->cast_q_data(host_q);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }
  atom->add_q_data();
  // Pack and transfer the AMOEBA per-atom extra data (multipoles, dipoles, ...).
  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  *ilist=nbor->host_ilist.begin();
  *jnum=nbor->host_acc.begin();

  // re-allocate dev_short_nbor if necessary
  if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) {
    // NOTE(review): this local _nmax shadows the member _nmax assigned above;
    // looks intentional (10% growth headroom) but confirm.
    int _nmax=static_cast<int>(static_cast<double>(inum_full)*1.10);
    dev_short_nbor.resize((2+_max_nbors)*_nmax);
  }

  hd_balancer.stop_timer();

  return nbor->host_jlist.begin()-host_start;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute multipole real-space part
|
||||
// precompute() should be already invoked before mem (re)allocation
|
||||
// this is the first part in a time step done on the GPU for AMOEBA for now
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
                                         const int /*nall*/, double ** /*host_x*/,
                                         int * /*host_type*/, int * /*host_amtype*/,
                                         int * /*host_amgroup*/, double ** /*host_rpole*/,
                                         double */*host_pval*/, double * /*sublo*/,
                                         double * /*subhi*/, tagint * /*tag*/,
                                         int ** /*nspecial*/, tagint ** /*special*/,
                                         int * /*nspecial15*/, tagint ** /*special15*/,
                                         const bool /*eflag_in*/, const bool /*vflag_in*/,
                                         const bool /*eatom*/, const bool /*vatom*/,
                                         int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                                         const double /*cpu_time*/, bool & /*success*/,
                                         const double aewald, const double felec,
                                         const double off2_mpole, double * /*host_q*/,
                                         double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {
  // Run the real-space multipole kernel.  Most parameters are unused here
  // because precompute() has already staged all per-atom data on the device.
  // On return, *tep_ptr points at the host copy of the torque (tep) array.

  // ------------------- Resize _tep array ------------------------

  if (inum_full>_max_tep_size) {
    // Grow with 10% headroom; 4 components per atom.
    _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    _tep.resize(_max_tep_size*4);
  }
  *tep_ptr=_tep.host.begin();

  // Kernel parameters: cutoff squared, electrostatic prefactor, Ewald alpha.
  _off2_mpole = off2_mpole;
  _felec = felec;
  _aewald = aewald;
  multipole_real(_eflag,_vflag);

  // leave the answers (forces, energies and virial) on the device,
  // only copy them back in the last kernel (polar_real)
  //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  //device->add_ans_object(ans);

  // copy tep from device to host

  _tep.update_host(_max_tep_size*4,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute the direct real space part
|
||||
// of the permanent field
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                    double **host_uind, double **host_uinp, double *host_pval,
                                    const double aewald, const double off2_polar,
                                    void** fieldp_ptr) {
  // Real-space direct contribution to the permanent field (udirect2b kernel).
  // On return, *fieldp_ptr points at the host copy of the combined
  // field/fieldp array.

  // all the necessary data arrays are already copied from host to device

  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  *fieldp_ptr=_fieldp.host.begin();

  // specify the correct cutoff and alpha values
  _off2_polar = off2_polar;
  _aewald = aewald;
  udirect2b(_eflag,_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)

  _fieldp.update_host(_max_fieldp_size*8,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute the direct real space part
|
||||
// of the induced field
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/,
                                    double **host_uind, double **host_uinp, double * /*host_pval*/,
                                    const double aewald, const double off2_polar,
                                    void** /*fieldp_ptr*/) {
  // Real-space direct contribution to the induced field (umutual2b kernel).
  // Called repeatedly during the induced-dipole iteration, so only the
  // iteration-dependent arrays (uind/uinp) are re-transferred.

  // only copy the necessary data arrays that are updated over the iterations
  // use nullptr for the other arrays that are already copied from host to device
  cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr);
  atom->add_extra_data();

  // set the correct cutoff and alpha
  _off2_polar = off2_polar;
  _aewald = aewald;
  // launch the kernel
  umutual2b(_eflag,_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
  // NOTE: move this step to update_fieldp() to delay device-host transfer
  //   after umutual1 and self are done on the GPU
  // *fieldp_ptr=_fieldp.host.begin();
  // _fieldp.update_host(_max_fieldp_size*8,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Prepare for umutual1() after bspline_fill() is done on host
|
||||
// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed
|
||||
// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
|
||||
// host_igrid is allocated with nmax by 4
|
||||
// - transfer extra data from host to device
|
||||
// NOTE: can be re-used for fphi_mpole() but with a different bsorder value
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
                                    double ***host_thetai1, double ***host_thetai2,
                                    double ***host_thetai3, int** host_igrid,
                                    const int nzlo_out, const int nzhi_out,
                                    const int nylo_out, const int nyhi_out,
                                    const int nxlo_out, const int nxhi_out) {
  // Stage the k-space (PME) inputs on the device: B-spline coefficient
  // arrays thetai1/2/3 (inum_full x bsorder x 4), per-atom grid indices
  // igrid, and the grid extents used to size the _cgrid_brick buffer.

  // update bsorder with that of the kspace solver
  _bsorder = bsorder;

  // allocate or resize per-atom arrays
  // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
  // will be consolidated once all terms are ready

  if (_max_thetai_size == 0) {
    // First call: allocate with 10% headroom over inum_full.
    _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
    _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
    _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);

    // Output buffers for fphi_uind (10 components) and fphi_mpole/sum (20).
    _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
    _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
    _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE);
  } else {
    // NOTE(review): the trigger compares against the OLD _max_thetai_size
    // while the new size is derived from inum_full — confirm this grows
    // correctly when bsorder changes between calls.
    if ((int)_thetai1.cols()<_max_thetai_size*bsorder) {
      _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
      _thetai1.resize(_max_thetai_size*bsorder);
      _thetai2.resize(_max_thetai_size*bsorder);
      _thetai3.resize(_max_thetai_size*bsorder);
      _igrid.resize(_max_thetai_size*4);

      _fdip_phi1.resize(_max_thetai_size*10);
      _fdip_phi2.resize(_max_thetai_size*10);
      _fdip_sum_phi.resize(_max_thetai_size*20);
    }
  }

#ifdef ASYNC_DEVICE_COPY
  // Stage transfers on the end-of-step queue so they overlap with compute.
  _thetai1.cq(ucl_device->cq(_end_command_queue));
  _thetai2.cq(ucl_device->cq(_end_command_queue));
  _thetai3.cq(ucl_device->cq(_end_command_queue));
#endif

  // pack host data to device

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai1[i][j][0];
      v.y = host_thetai1[i][j][1];
      v.z = host_thetai1[i][j][2];
      v.w = host_thetai1[i][j][3];
      _thetai1[idx] = v;
    }
  _thetai1.update_device(true);

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai2[i][j][0];
      v.y = host_thetai2[i][j][1];
      v.z = host_thetai2[i][j][2];
      v.w = host_thetai2[i][j][3];
      _thetai2[idx] = v;
    }
  _thetai2.update_device(true);

  for (int i = 0; i < inum_full; i++)
    for (int j = 0; j < bsorder; j++) {
      int idx = i*bsorder + j;
      numtyp4 v;
      v.x = host_thetai3[i][j][0];
      v.y = host_thetai3[i][j][1];
      v.z = host_thetai3[i][j][2];
      v.w = host_thetai3[i][j][3];
      _thetai3[idx] = v;
    }
  _thetai3.update_device(true);

  // Only 3 of the 4 igrid slots are filled; the 4th is padding
  // (presumably for alignment — confirm against the kernel).
  for (int i = 0; i < inum_full; i++) {
    int idx = i*4;
    _igrid[idx+0] = host_igrid[i][0];
    _igrid[idx+1] = host_igrid[i][1];
    _igrid[idx+2] = host_igrid[i][2];
  }
  _igrid.update_device(true);

  // _cgrid_brick holds the grid-based potential

  _nzlo_out = nzlo_out;
  _nzhi_out = nzhi_out;
  _nylo_out = nylo_out;
  _nyhi_out = nyhi_out;
  _nxlo_out = nxlo_out;
  _nxhi_out = nxhi_out;
  _ngridz = nzhi_out - nzlo_out + 1;
  _ngridy = nyhi_out - nylo_out + 1;
  _ngridx = nxhi_out - nxlo_out + 1;
  _num_grid_points = _ngridx * _ngridy * _ngridz;

  int numel = _num_grid_points;
  if (_cgrid_brick.cols() == 0) {
    int nsize=(int)(((double)numel)*1.1);
    _cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
  } else if (numel > (int)_cgrid_brick.cols()) {
    _cgrid_brick.resize(numel);
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// fphi_uind = induced potential from grid
|
||||
// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
||||
// NOTE: host_grid_brick is from ic_kspace post_convolution()
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
                                    void **host_fdip_phi1,
                                    void **host_fdip_phi2,
                                    void **host_fdip_sum_phi)
{
  // Extract the induced-dipole potential from the PME grid: flatten the
  // host grid brick (complex values: [0]=real, [1]=imag) into _cgrid_brick,
  // run the fphi_uind kernel, and return host pointers to the results.
  int n = 0;
  for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
    for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
      for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
        numtyp2 v;
        v.x = host_grid_brick[iz][iy][ix][0];
        v.y = host_grid_brick[iz][iy][ix][1];
        _cgrid_brick[n] = v;
        n++;
      }
  _cgrid_brick.update_device(_num_grid_points, false);

#ifdef ASYNC_DEVICE_COPY
  // Ensure the asynchronously staged theta arrays have arrived.
  ucl_device->sync();
#endif

  // launch the kernel with its execution configuration (see below)
  fphi_uind();

  // copy data from device to host
  _fdip_phi1.update_host(_max_thetai_size*10, false);
  _fdip_phi2.update_host(_max_thetai_size*10, false);
  _fdip_sum_phi.update_host(_max_thetai_size*20, false);

  // return the pointers to the host-side arrays
  *host_fdip_phi1 = _fdip_phi1.host.begin();
  *host_fdip_phi2 = _fdip_phi2.host.begin();
  *host_fdip_sum_phi = _fdip_sum_phi.host.begin();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Interpolate the potential from the PME grid
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::fphi_uind() {
|
||||
int ainum=ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
|
||||
const int BX=block_size();
|
||||
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||
|
||||
time_pair.start();
|
||||
int ngridxy = _ngridx * _ngridy;
|
||||
k_fphi_uind.set_size(GX,BX);
|
||||
k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
|
||||
&_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum,
|
||||
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
|
||||
time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// fphi_mpole = multipole potential from grid (limited to polar_kspace for now)
|
||||
// fphi_mpole extracts the permanent multipole potential from
|
||||
// the particle mesh Ewald grid
|
||||
// NOTE: host_grid_brick is from p_kspace post_convolution()
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec)
{
  // Extract the permanent-multipole potential from the PME grid: flatten
  // the real-valued host grid brick into _cgrid_brick (imaginary part
  // zeroed), run the fphi_mpole kernel, and return a host pointer to fphi.
  int n = 0;
  for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
    for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
      for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
        numtyp2 v;
        v.x = host_grid_brick[iz][iy][ix];
        v.y = (numtyp)0;
        _cgrid_brick[n] = v;
        n++;
      }
  _cgrid_brick.update_device(_num_grid_points, false);

  // Electrostatic prefactor consumed by the kernel.
  _felec = felec;
  fphi_mpole();

  // Copy the 20-component potential per atom back to the host.
  _fdip_sum_phi.update_host(_max_thetai_size*20, false);

  *host_fphi = _fdip_sum_phi.host.begin();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Interpolate the potential from the PME grid
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::fphi_mpole() {
|
||||
int ainum=ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
|
||||
const int BX=block_size();
|
||||
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||
|
||||
time_pair.start();
|
||||
int ngridxy = _ngridx * _ngridy;
|
||||
k_fphi_mpole.set_size(GX,BX);
|
||||
k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
|
||||
&_fdip_sum_phi, &_bsorder, &ainum, &_felec,
|
||||
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
|
||||
time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute polar real-space
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
                                     double **host_rpole, double **host_uind,
                                     double **host_uinp, double *host_pval,
                                     const bool eflag_in, const bool vflag_in,
                                     const bool eatom, const bool vatom,
                                     const double aewald, const double felec,
                                     const double off2_polar, void **tep_ptr) {
  // Real-space polarization kernel; the final GPU kernel of the timestep,
  // so this is where accumulated forces/energies/virials are copied back.

  // cast necessary data arrays from host to device

  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
  atom->add_extra_data();

  *tep_ptr=_tep.host.begin();

  // Kernel parameters: cutoff squared, electrostatic prefactor, Ewald alpha.
  _off2_polar = off2_polar;
  _felec = felec;
  _aewald = aewald;
  const int red_blocks=polar_real(_eflag,_vflag);

  // only copy answers (forces, energies and virial) back from the device
  // in the last kernel (which is polar_real here)
  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  device->add_ans_object(ans);

  // copy tep from device to host
  _tep.update_host(_max_tep_size*4,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Return the memory bytes allocated on the host and device
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double BaseAmoebaT::host_memory_usage_atomic() const {
|
||||
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
|
||||
4*sizeof(numtyp)+sizeof(BaseAmoeba<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Setup the FFT plan: only placeholder for now
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/)
{
  // Placeholder: FFT plan creation is not implemented yet.
  // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT)
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute FFT on the device: only placeholder for now
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/,
                                const int /*numel*/, const int /*mode*/)
{
  // Placeholder: device-side 1-D FFT is not wired up yet; the body below is
  // disabled (#if 0) cuFFT reference code kept for future work.
  // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT)
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
  // Create the Z2Z (double-complex) plan once; numel counts doubles,
  // so numel/2 complex elements.
  if (fft_plan_created == false) {
    int m = numel/2;
    cufftPlan1d(&plan, m, CUFFT_Z2Z, 1);
    fft_plan_created = true;
  }

  // n = number of double complex
  int n = numel/2;

  // copy the host array to the device (data)
  UCL_Vector<cufftDoubleComplex,cufftDoubleComplex> data;
  data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE);
  int m = 0;
  double* d_in = (double*)in;
  for (int i = 0; i < n; i++) {
    data[i].x = d_in[m];
    data[i].y = d_in[m+1];
    m += 2;
  }
  data.update_device(false);

  // perform the in-place forward FFT

  cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device,
                                    (cufftDoubleComplex*)&data.device, CUFFT_FORWARD);
  if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result);
  ucl_device->sync();
  data.update_host(false);

  // copy back the data to the host array

  m = 0;
  double* d_out = (double*)out;
  for (int i = 0; i < n; i++) {
    d_out[m] = data[i].x;
    d_out[m+1] = data[i].y;
    m += 2;
  }

  data.clear();
#endif
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Copy the extra data from host to device
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
                                  double** uind, double** uinp, double* pval) {
  // Pack the AMOEBA per-atom extra data into the atom->extra staging buffer
  // as consecutive numtyp4 sections of _nall entries each:
  //   [0] rpole 0-3, [1] rpole 4,5,6,8, [2] rpole 9,12 + amtype + amgroup,
  //   [3] uind, [4] uinp, [5] pval.
  // A nullptr argument means that section is already on the device and is
  // skipped (its offset is still advanced so later sections line up).

  // signal that we need to transfer extra data from the host

  atom->extra_data_unavail();

  int _nall=atom->nall();
  numtyp4 *pextra=reinterpret_cast<numtyp4*>(&(atom->extra[0]));

  int n = 0;
  int nstride = 1; //4;
  if (rpole) {
    // Section 0: charge + dipole components rpole[0..3].
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][0];
      pextra[idx].y = rpole[i][1];
      pextra[idx].z = rpole[i][2];
      pextra[idx].w = rpole[i][3];
    }

    // Section 1: rpole[4],[5],[6],[8] — indices 7,10,11 are skipped,
    // presumably because the quadrupole tensor is symmetric (confirm).
    n += nstride*_nall;
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][4];
      pextra[idx].y = rpole[i][5];
      pextra[idx].z = rpole[i][6];
      pextra[idx].w = rpole[i][8];
    }

    // Section 2: rpole[9],[12] plus amtype/amgroup encoded as numtyp.
    n += nstride*_nall;
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = rpole[i][9];
      pextra[idx].y = rpole[i][12];
      pextra[idx].z = (numtyp)amtype[i];
      pextra[idx].w = (numtyp)amgroup[i];
    }
  } else {
    // rpole unchanged on device: skip past sections 1 and 2.
    n += 2*nstride*_nall;
  }

  // Section 3: induced dipoles.
  n += nstride*_nall;
  if (uind) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = uind[i][0];
      pextra[idx].y = uind[i][1];
      pextra[idx].z = uind[i][2];
      pextra[idx].w = 0;
    }
  }

  // Section 4: induced dipoles (p variant).
  n += nstride*_nall;
  if (uinp) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = uinp[i][0];
      pextra[idx].y = uinp[i][1];
      pextra[idx].z = uinp[i][2];
      pextra[idx].w = 0;
    }
  }

  // Section 5: per-atom pval scalar.
  n += nstride*_nall;
  if (pval) {
    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
      pextra[idx].x = pval[i];
      pextra[idx].y = 0;
      pextra[idx].z = 0;
      pextra[idx].w = 0;
    }
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compile (load) the kernel strings and set the kernels
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                  const char *kname_multipole,
                                  const char *kname_udirect2b,
                                  const char *kname_umutual2b,
                                  const char *kname_polar,
                                  const char *kname_fphi_uind,
                                  const char *kname_fphi_mpole,
                                  const char *kname_short_nbor,
                                  const char* kname_special15) {
  // Compile the device program once and bind all AMOEBA kernels and
  // textures by name.  Safe to call repeatedly; subsequent calls no-op.
  if (_compiled)
    return;

  if (pair_program) delete pair_program;
  pair_program=new UCL_Program(dev);
  // EVFLAG=1 enables energy/virial accumulation code paths in the kernels.
  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
  pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen);

  k_multipole.set_function(*pair_program, kname_multipole);
  k_udirect2b.set_function(*pair_program, kname_udirect2b);
  k_umutual2b.set_function(*pair_program, kname_umutual2b);
  k_polar.set_function(*pair_program, kname_polar);
  k_fphi_uind.set_function(*pair_program, kname_fphi_uind);
  k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole);
  k_short_nbor.set_function(*pair_program, kname_short_nbor);
  k_special15.set_function(*pair_program, kname_special15);
  pos_tex.get_texture(*pair_program, "pos_tex");
  q_tex.get_texture(*pair_program, "q_tex");

  _compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
  // Clamp threads-per-atom to the polar kernel's subgroup size on OpenCL.
  if (dev.has_subgroup_support()) {
    int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size);
    if (_threads_per_atom > mx_subgroup_sz)
      _threads_per_atom = mx_subgroup_sz;
    device->set_simd_size(mx_subgroup_sz);
  }
#endif

}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Specify 1-5 neighbors from the current neighbor list
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int BaseAmoebaT::add_onefive_neighbors() {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/
|
||||
(BX/_threads_per_atom)));
|
||||
|
||||
int _nall=atom->nall();
|
||||
int ainum=ans->inum();
|
||||
int nbor_pitch=nbor->nbor_pitch();
|
||||
|
||||
k_special15.set_size(GX,BX);
|
||||
k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(),
|
||||
&atom->dev_tag, &dev_nspecial15, &dev_special15,
|
||||
&ainum, &_nall, &nbor_pitch,
|
||||
&_threads_per_atom);
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// Explicit instantiation for the precision combination selected at build time.
template class BaseAmoeba<PRECISION,ACC_PRECISION>;
}
|
||||
325
lib/gpu/lal_base_amoeba.h
Normal file
325
lib/gpu/lal_base_amoeba.h
Normal file
@ -0,0 +1,325 @@
|
||||
/***************************************************************************
|
||||
base_amoeba.h
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Base class for pair styles needing per-particle data for position,
|
||||
charge, and type.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_BASE_AMOEBA_H
|
||||
#define LAL_BASE_AMOEBA_H
|
||||
|
||||
#include "lal_device.h"
|
||||
#include "lal_balance.h"
|
||||
#include "mpi.h"
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "geryon/ocl_texture.h"
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_texture.h"
|
||||
#elif defined(USE_HIP)
|
||||
#include "geryon/hip_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
|
||||
//#define ASYNC_DEVICE_COPY
|
||||
|
||||
#if !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
// temporary workaround for int2 also defined in cufft
|
||||
#ifdef int2
|
||||
#undef int2
|
||||
#endif
|
||||
#include "cufft.h"
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class BaseAmoeba {
|
||||
public:
|
||||
BaseAmoeba();
|
||||
virtual ~BaseAmoeba();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
* \param k_name name for the kernel for force calculation
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||
const double gpu_split, FILE *screen, const void *pair_program,
|
||||
const char *kname_multipole, const char *kname_udirect2b,
|
||||
const char *kname_umutual2b, const char *kname_polar,
|
||||
const char *kname_fphi_uind, const char *kname_fphi_mpole,
|
||||
const char *kname_short_nbor, const char* kname_special15);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead(const int add_kernels=0);
|
||||
|
||||
  /// Check if there is enough storage for atom arrays and realloc if not
  /** \param inum number of local particles producing answers
    * \param nall total number of particles (local + ghost)
    * \param success set to false if insufficient memory **/
  inline void resize_atom(const int inum, const int nall, bool &success) {
    if (atom->resize(nall, success)) {
      // Storage moved: rebind the position and charge textures.
      pos_tex.bind_float(atom->x,4);
      q_tex.bind_float(atom->q,1);
    }
    ans->resize(inum,success);
  }
|
||||
|
||||
  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param max_nbors current maximum number of neighbors per particle
    * \param success set to false if insufficient memory **/
  inline void resize_local(const int inum, const int max_nbors, bool &success) {
    nbor->resize(inum,max_nbors,success);
  }
|
||||
|
||||
  /// Check if there is enough storage for neighbors and realloc if not
  /** \param inum number of particles whose nbors must be stored on device
    * \param host_inum number of particles whose nbors need to copied to host
    * \param max_nbors current maximum number of neighbors per particle
    * \param success set to false if insufficient memory
    * \note host_inum is 0 if the host is performing neighboring
    * \note inum+host_inum=total number local particles **/
  inline void resize_local(const int inum, const int host_inum,
                           const int max_nbors, bool &success) {
    nbor->resize(inum,host_inum,max_nbors,success);
  }
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear_atomic();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom_atomic(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage_atomic() const;
|
||||
|
||||
/// Accumulate timers
|
||||
inline void acc_timers() {
|
||||
if (device->time_device()) {
|
||||
nbor->acc_timers(screen);
|
||||
time_pair.add_to_total();
|
||||
atom->acc_timers();
|
||||
ans->acc_timers();
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero timers
|
||||
inline void zero_timers() {
|
||||
time_pair.zero();
|
||||
atom->zero_timers();
|
||||
ans->zero_timers();
|
||||
}
|
||||
|
||||
/// Copy neighbor list from host
|
||||
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
|
||||
int **firstneigh, bool &success);
|
||||
|
||||
/// Build neighbor list on device
|
||||
int build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, int *nspecial15, tagint **special15,
|
||||
bool &success);
|
||||
|
||||
/// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed
|
||||
virtual int** precompute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole, double **host_uind,
|
||||
double **host_uinp, double *host_pval, double *sublo, double *subhi,
|
||||
tagint *tag, int **nspecial, tagint **special,
|
||||
int *nspecial15, tagint **special15,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **&ilist, int **&numj, const double cpu_time, bool &success,
|
||||
double *charge, double *boxlo, double *prd);
|
||||
|
||||
/// Compute multipole real-space with device neighboring
|
||||
virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
int *host_amgroup, double **host_rpole, double *host_pval,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, int *nspecial15, tagint **special15,
|
||||
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **numj, const double cpu_time,
|
||||
bool &success, const double aewald, const double felec,
|
||||
const double off2_mpole, double *charge, double *boxlo,
|
||||
double *prd, void **tep_ptr);
|
||||
|
||||
/// Compute the real space part of the permanent field (udirect2b) with device neighboring
|
||||
virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp, double *host_pval,
|
||||
const double aewald, const double off2_polar, void **fieldp_ptr);
|
||||
|
||||
/// Compute the real space part of the induced field (umutual2b) with device neighboring
|
||||
virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp, double *host_pval,
|
||||
const double aewald, const double off2_polar, void **fieldp_ptr);
|
||||
|
||||
/// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
|
||||
virtual void precompute_kspace(const int inum_full, const int bsorder,
|
||||
double ***host_thetai1, double ***host_thetai2,
|
||||
double ***host_thetai3, int** igrid,
|
||||
const int nzlo_out, const int nzhi_out,
|
||||
const int nylo_out, const int nyhi_out,
|
||||
const int nxlo_out, const int nxhi_out);
|
||||
/// Interpolate the induced potential from the grid
|
||||
virtual void compute_fphi_uind(double ****host_grid_brick,
|
||||
void **host_fdip_phi1, void **host_fdip_phi2,
|
||||
void **host_fdip_sum_phi);
|
||||
|
||||
/// Interpolate the multipolar potential from the grid
|
||||
virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
|
||||
const double felec);
|
||||
|
||||
/// Compute polar real-space with device neighboring
|
||||
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
double **host_uind, double **host_uinp, double *host_pval,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom,
|
||||
const double aewald, const double felec, const double off2_polar,
|
||||
void **tep_ptr);
|
||||
|
||||
// copy field and fieldp from device to host after umutual2b
|
||||
virtual void update_fieldp(void **fieldp_ptr) {
|
||||
*fieldp_ptr=_fieldp.host.begin();
|
||||
// _fieldp store both arrays, one after another
|
||||
_fieldp.update_host(_max_fieldp_size*8,false);
|
||||
}
|
||||
|
||||
/// setup a plan for FFT, where size is the number of elements
|
||||
|
||||
void setup_fft(const int size, const int element_type=0);
|
||||
|
||||
/// compute forward/backward FFT on the device
|
||||
|
||||
void compute_fft1d(void* in, void* out, const int numel, const int mode);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
Device<numtyp,acctyp> *device;
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *ucl_device;
|
||||
|
||||
/// Device Timers
|
||||
UCL_Timer time_pair;
|
||||
|
||||
/// Host device load balancer
|
||||
Balance<numtyp,acctyp> hd_balancer;
|
||||
|
||||
/// LAMMPS pointer for screen output
|
||||
FILE *screen;
|
||||
|
||||
// --------------------------- ATOM DATA --------------------------
|
||||
|
||||
/// Atom Data
|
||||
Atom<numtyp,acctyp> *atom;
|
||||
|
||||
UCL_Vector<numtyp,numtyp> polar1, polar2, polar3, polar4, polar5;
|
||||
|
||||
/// cast host arrays into a single array for atom->extra
|
||||
void cast_extra_data(int* amtype, int* amgroup, double** rpole,
|
||||
double** uind, double** uinp, double* pval=nullptr);
|
||||
|
||||
/// Per-atom arrays
|
||||
UCL_Vector<acctyp,acctyp> _tep, _fieldp;
|
||||
int _nmax, _max_tep_size, _max_fieldp_size;
|
||||
|
||||
int _bsorder;
|
||||
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
|
||||
UCL_Vector<int,int> _igrid;
|
||||
UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
|
||||
UCL_Vector<acctyp,acctyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
|
||||
int _max_thetai_size;
|
||||
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
|
||||
int _ngridx, _ngridy, _ngridz, _num_grid_points;
|
||||
|
||||
int _end_command_queue;
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
|
||||
Answer<numtyp,acctyp> *ans;
|
||||
|
||||
// --------------------------- NBOR DATA ----------------------------
|
||||
|
||||
/// Neighbor data
|
||||
Neighbor *nbor;
|
||||
/// Device storage for 1-5 special neighbor counts
|
||||
UCL_D_Vec<int> dev_nspecial15;
|
||||
/// Device storage for special neighbors
|
||||
UCL_D_Vec<tagint> dev_special15, dev_special15_t;
|
||||
|
||||
int add_onefive_neighbors();
|
||||
|
||||
UCL_D_Vec<int> dev_short_nbor;
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
|
||||
UCL_Kernel k_fphi_uind, k_fphi_mpole;
|
||||
UCL_Kernel k_special15, k_short_nbor;
|
||||
inline int block_size() { return _block_size; }
|
||||
inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {}
|
||||
|
||||
// --------------------------- TEXTURES -----------------------------
|
||||
UCL_Texture pos_tex;
|
||||
UCL_Texture q_tex;
|
||||
|
||||
protected:
|
||||
bool _compiled;
|
||||
int _block_size, _block_bio_size, _threads_per_atom;
|
||||
int _extra_fields;
|
||||
double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
bool short_nbor_polar_avail;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
numtyp _aewald,_felec;
|
||||
numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar;
|
||||
|
||||
int _eflag, _vflag;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||
const char *kname_multipole, const char *kname_udirect2b,
|
||||
const char *kname_umutual2b, const char *kname_polar,
|
||||
const char *kname_fphi_uind, const char *kname_fphi_mpole,
|
||||
const char *kname_short_nbor, const char* kname_special15);
|
||||
|
||||
virtual int multipole_real(const int eflag, const int vflag) = 0;
|
||||
virtual int udirect2b(const int eflag, const int vflag) = 0;
|
||||
virtual int umutual2b(const int eflag, const int vflag) = 0;
|
||||
virtual int fphi_uind();
|
||||
virtual int fphi_mpole();
|
||||
virtual int polar_real(const int eflag, const int vflag) = 0;
|
||||
|
||||
|
||||
#if !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
cufftHandle plan;
|
||||
#endif
|
||||
bool fft_plan_created;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
bool charge = false;
|
||||
bool rot = false;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
|
||||
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
|
||||
bool charge = true;
|
||||
bool rot = false;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
|
||||
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
|
||||
bool charge = true;
|
||||
bool rot = true;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
|
||||
bool charge = false;
|
||||
bool rot = false;
|
||||
bool vel = true;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -193,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const double cpu_time, bool &success, tagint *tag,
|
||||
double **host_v, const double dtinvsqrt,
|
||||
const int seed, const int timestep,
|
||||
const int nlocal, double *boxlo, double *prd) {
|
||||
const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
@ -258,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
|
||||
const double cpu_time, bool &success,
|
||||
double **host_v, const double dtinvsqrt,
|
||||
const int seed, const int timestep,
|
||||
double *boxlo, double *prd) {
|
||||
double * /*boxlo*/, double * /*prd*/) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
|
||||
@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
||||
else
|
||||
_threads_per_atom=device->threads_per_three();
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
bool charge = false;
|
||||
bool rot = false;
|
||||
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -44,18 +44,14 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int CHARMMLongT::init(const int ntypes,
|
||||
double host_cut_bothsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3,
|
||||
double **host_lj4, double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *_screen,
|
||||
int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||
double ** /*host_offset*/, double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split, FILE *_screen,
|
||||
double host_cut_ljsq, const double host_cut_coulsq,
|
||||
double *host_special_coul, const double qqrd2e,
|
||||
const double g_ewald, const double cut_lj_innersq,
|
||||
const double denom_lj, double **epsilon,
|
||||
double *host_special_coul, const double qqrd2e, const double g_ewald,
|
||||
const double cut_lj_innersq, const double denom_lj, double **epsilon,
|
||||
double **sigma, const bool mix_arithmetic) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
|
||||
@ -52,7 +52,7 @@ DeviceT::~Device() {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
|
||||
int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu,
|
||||
const int first_gpu_id, const int gpu_mode,
|
||||
const double p_split, const int t_per_atom,
|
||||
const double user_cell_size, char *ocl_args,
|
||||
@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
|
||||
}
|
||||
|
||||
_ocl_compile_string="-cl-mad-enable ";
|
||||
#ifdef CL_VERSION_2_0
|
||||
_ocl_compile_string+="-cl-std=CL2.0 ";
|
||||
#endif
|
||||
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
|
||||
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
|
||||
std::string(OCL_PRECISION_COMPILE);
|
||||
@ -438,7 +441,7 @@ template <class numtyp, class acctyp>
|
||||
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
const bool rot, const int nlocal,
|
||||
const int nall, const int maxspecial,
|
||||
const bool vel) {
|
||||
const bool vel, const int extra_fields) {
|
||||
if (!_device_init)
|
||||
return -1;
|
||||
if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision())
|
||||
@ -467,7 +470,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
|
||||
if (_init_count==0) {
|
||||
// Initialize atom and nbor data
|
||||
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
|
||||
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields))
|
||||
return -3;
|
||||
|
||||
_data_in_estimate++;
|
||||
@ -477,6 +480,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
_data_in_estimate++;
|
||||
if (vel)
|
||||
_data_in_estimate++;
|
||||
if (extra_fields>0)
|
||||
_data_in_estimate++;
|
||||
|
||||
} else {
|
||||
if (!atom.charge() && charge)
|
||||
_data_in_estimate++;
|
||||
@ -484,7 +490,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
_data_in_estimate++;
|
||||
if (!atom.velocity() && vel)
|
||||
_data_in_estimate++;
|
||||
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
|
||||
if (atom.using_extra() && extra_fields>0)
|
||||
_data_in_estimate++;
|
||||
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields))
|
||||
return -3;
|
||||
}
|
||||
|
||||
@ -520,7 +528,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
|
||||
const int host_nlocal, const int nall,
|
||||
const int host_nlocal, const int /*nall*/,
|
||||
const int maxspecial, const int gpu_host,
|
||||
const int max_nbors, const double cutoff,
|
||||
const bool pre_cut, const int threads_per_atom,
|
||||
|
||||
@ -61,6 +61,7 @@ class Device {
|
||||
* \param nall Total number of local+ghost particles
|
||||
* \param maxspecial Maximum mumber of special bonded atoms per atom
|
||||
* \param vel True if velocities need to be stored
|
||||
* \param extra_fields Nonzero if extra fields need to be stored
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
@ -70,7 +71,7 @@ class Device {
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
|
||||
const int nlocal, const int nall, const int maxspecial,
|
||||
const bool vel=false);
|
||||
const bool vel=false, const int extra_fields=0);
|
||||
|
||||
/// Initialize the device for Atom storage only
|
||||
/** \param nlocal Total number of local particles to allocate memory for
|
||||
|
||||
@ -30,7 +30,7 @@ static DPD<PRECISION,ACC_PRECISION> DPDTMF;
|
||||
int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0,
|
||||
double **host_gamma, double **host_sigma, double **host_cut,
|
||||
double *special_lj, const int inum,
|
||||
const int nall, const int max_nbors, const int maxspecial,
|
||||
const int nall, const int /*max_nbors*/, const int maxspecial,
|
||||
const double cell_size, int &gpu_mode, FILE *screen) {
|
||||
DPDTMF.clear();
|
||||
gpu_mode=DPDTMF.device->gpu_mode();
|
||||
|
||||
@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
const bool /*eatom*/, const bool /*vatom*/,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, void **fp_ptr) {
|
||||
this->acc_timers();
|
||||
@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag_in,
|
||||
const bool vflag_in, const bool eatom,
|
||||
const bool vatom, int &host_start, int **ilist, int **jnum,
|
||||
const bool vflag_in, const bool /*eatom*/,
|
||||
const bool /*vatom*/, int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success, int &inum,
|
||||
void **fp_ptr) {
|
||||
this->acc_timers();
|
||||
|
||||
641
lib/gpu/lal_hippo.cpp
Normal file
641
lib/gpu/lal_hippo.cpp
Normal file
@ -0,0 +1,641 @@
|
||||
/***************************************************************************
|
||||
hippo.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the hippo pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#if defined(USE_OPENCL)
|
||||
#include "hippo_cl.h"
|
||||
#elif defined(USE_CUDART)
|
||||
const char *hippo=0;
|
||||
#else
|
||||
#include "hippo_cubin.h"
|
||||
#endif
|
||||
|
||||
#include "lal_hippo.h"
|
||||
#include <cassert>
|
||||
namespace LAMMPS_AL {
|
||||
#define HippoT Hippo<numtyp, acctyp>
|
||||
|
||||
extern Device<PRECISION,ACC_PRECISION> device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
HippoT::Hippo() : BaseAmoeba<numtyp,acctyp>(),
|
||||
_allocated(false) {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
HippoT::~Hippo() {
|
||||
clear();
|
||||
k_repulsion.clear();
|
||||
k_dispersion.clear();
|
||||
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
|
||||
const double *host_pdamp, const double *host_thole,
|
||||
const double *host_dirdamp, const int *host_amtype2class,
|
||||
const double *host_special_repel, const double *host_special_disp,
|
||||
const double *host_special_mpole,
|
||||
const double *host_special_polar_wscale,
|
||||
const double *host_special_polar_piscale,
|
||||
const double *host_special_polar_pscale,
|
||||
const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
|
||||
const double *host_csix, const double *host_adisp,
|
||||
const double *host_pcore, const double *host_palpha,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const int maxspecial15,
|
||||
const double cell_size, const double gpu_split, FILE *_screen,
|
||||
const double polar_dscale, const double polar_uscale) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||
cell_size,gpu_split,_screen,hippo,
|
||||
"k_hippo_multipole", "k_hippo_udirect2b",
|
||||
"k_hippo_umutual2b", "k_hippo_polar",
|
||||
"k_hippo_fphi_uind", "k_hippo_fphi_mpole",
|
||||
"k_hippo_short_nbor", "k_hippo_special15");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// specific to HIPPO
|
||||
k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion");
|
||||
k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion");
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
int lj_types=ntypes;
|
||||
shared_types=false;
|
||||
int max_shared_types=this->device->max_shared_types();
|
||||
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||
lj_types=max_shared_types;
|
||||
shared_types=true;
|
||||
}
|
||||
_lj_types=lj_types;
|
||||
|
||||
// Allocate a host write buffer for data initialization
|
||||
|
||||
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
for (int i = 0; i < max_amtype; i++) {
|
||||
host_write[i].x = host_pdamp[i];
|
||||
host_write[i].y = host_thole[i];
|
||||
host_write[i].z = host_dirdamp[i];
|
||||
host_write[i].w = host_amtype2class[i];
|
||||
}
|
||||
|
||||
coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_amtype,host_write,false);
|
||||
|
||||
for (int i = 0; i < max_amtype; i++) {
|
||||
host_write[i].x = host_sizpr[i];
|
||||
host_write[i].y = host_dmppr[i];
|
||||
host_write[i].z = host_elepr[i];
|
||||
host_write[i].w = (numtyp)0;
|
||||
}
|
||||
|
||||
coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_rep,host_write,false);
|
||||
|
||||
UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
for (int i = 0; i < max_amclass; i++) {
|
||||
host_write2[i].x = host_csix[i];
|
||||
host_write2[i].y = host_adisp[i];
|
||||
host_write2[i].z = host_pcore[i];
|
||||
host_write2[i].w = host_palpha[i];
|
||||
}
|
||||
|
||||
coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
|
||||
ucl_copy(coeff_amclass,host_write2,false);
|
||||
|
||||
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
|
||||
sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<5; i++) {
|
||||
dview[i].x=host_special_polar_wscale[i];
|
||||
dview[i].y=host_special_polar_piscale[i];
|
||||
dview[i].z=host_special_polar_pscale[i];
|
||||
dview[i].w=host_special_mpole[i];
|
||||
}
|
||||
ucl_copy(sp_polar,dview,5,false);
|
||||
|
||||
sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<5; i++) {
|
||||
dview[i].x=host_special_repel[i];
|
||||
dview[i].y=host_special_disp[i];
|
||||
dview[i].z=(numtyp)0;
|
||||
dview[i].w=(numtyp)0;
|
||||
}
|
||||
ucl_copy(sp_nonpolar,dview,5,false);
|
||||
|
||||
_polar_dscale = polar_dscale;
|
||||
_polar_uscale = polar_uscale;
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes()
|
||||
+ coeff_amclass.row_bytes() + sp_polar.row_bytes()
|
||||
+ sp_nonpolar.row_bytes() + this->_tep.row_bytes()
|
||||
+ this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
|
||||
+ this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
|
||||
+ this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void HippoT::clear() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
coeff_amtype.clear();
|
||||
coeff_rep.clear();
|
||||
coeff_amclass.clear();
|
||||
sp_polar.clear();
|
||||
sp_nonpolar.clear();
|
||||
|
||||
this->clear_atomic();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double HippoT::host_memory_usage() const {
|
||||
return this->host_memory_usage_atomic()+sizeof(Hippo<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute the repulsion term, returning tep
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void HippoT::compute_repulsion(const int /*ago*/, const int inum_full,
|
||||
const int /*nall*/, double ** /*host_x*/,
|
||||
int * /*host_type*/, int * /*host_amtype*/,
|
||||
int * /*host_amgroup*/, double ** /*host_rpole*/,
|
||||
double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/,
|
||||
int ** /*nspecial*/, tagint ** /*special*/,
|
||||
int * /*nspecial15*/, tagint ** /*special15*/,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
|
||||
const double /*cpu_time*/, bool & /*success*/,
|
||||
const double /*aewald*/, const double off2_repulse,
|
||||
double * /*host_q*/, double * /*boxlo*/, double * /*prd*/,
|
||||
double cut2, double c0, double c1, double c2,
|
||||
double c3, double c4, double c5, void **tep_ptr) {
|
||||
this->acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
this->set_kernel(eflag,vflag);
|
||||
|
||||
// ------------------- Resize _tep array ------------------------
|
||||
|
||||
if (inum_full>this->_max_tep_size) {
|
||||
this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
this->_tep.resize(this->_max_tep_size*4);
|
||||
}
|
||||
*tep_ptr=this->_tep.host.begin();
|
||||
|
||||
this->_off2_repulse = off2_repulse;
|
||||
_cut2 = cut2;
|
||||
_c0 = c0;
|
||||
_c1 = c1;
|
||||
_c2 = c2;
|
||||
_c3 = c3;
|
||||
_c4 = c4;
|
||||
_c5 = c5;
|
||||
repulsion(this->_eflag,this->_vflag);
|
||||
|
||||
// copy tep from device to host
|
||||
this->_tep.update_host(this->_max_tep_size*4,false);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the repulsion kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::repulsion(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff off2_disp,
|
||||
// at this point repuslion is the first kernel in a time step for HIPPO
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_repulse, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
k_repulsion.set_size(GX,BX);
|
||||
k_repulsion.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_rep, &sp_nonpolar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald,
|
||||
&this->_off2_repulse, &_cut2,
|
||||
&_c0, &_c1, &_c2, &_c3, &_c4, &_c5);
|
||||
this->time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Compute dispersion real-space
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup,
|
||||
double **host_rpole, const double aewald,
|
||||
const double off2_disp) {
|
||||
|
||||
// cast necessary data arrays from host to device
|
||||
|
||||
this->cast_extra_data(host_amtype, host_amgroup, host_rpole,
|
||||
nullptr, nullptr, nullptr);
|
||||
this->atom->add_extra_data();
|
||||
|
||||
this->_off2_disp = off2_disp;
|
||||
this->_aewald = aewald;
|
||||
dispersion_real(this->_eflag,this->_vflag);
|
||||
|
||||
// only copy them back if this is the last kernel
|
||||
// otherwise, commenting out these two lines to leave the answers
|
||||
// (forces, energies and virial) on the device until the last kernel
|
||||
//this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||
//this->device->add_ans_object(this->ans);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the dispersion real-space kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::dispersion_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff off2_disp,
|
||||
// at this point dispersion is the first kernel in a time step
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_disp, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
k_dispersion.set_size(GX,BX);
|
||||
k_dispersion.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_nonpolar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald,
|
||||
&this->_off2_disp);
|
||||
this->time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Compute the multipole real-space term, returning tep
// (host-side driver: transfers pval, sizes the tep buffer, launches the
//  kernel via multipole_real(), then copies tep back to the host)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full,
                                    const int /*nall*/, double ** /*host_x*/,
                                    int * /*host_type*/, int * /*host_amtype*/,
                                    int * /*host_amgroup*/, double ** /*host_rpole*/,
                                    double* host_pval, double * /*sublo*/,
                                    double * /*subhi*/, tagint * /*tag*/,
                                    int ** /*nspecial*/, tagint ** /*special*/,
                                    int * /*nspecial15*/, tagint ** /*special15*/,
                                    const bool /*eflag_in*/, const bool /*vflag_in*/,
                                    const bool /*eatom*/, const bool /*vatom*/,
                                    int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
                                    const double /*cpu_time*/, bool & /*success*/,
                                    const double aewald, const double felec,
                                    const double off2_mpole, double * /*host_q*/,
                                    double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {

  // cast necessary data arrays from host to device:
  // only host_pval is refreshed here; the other extra-data slots are nullptr
  this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval);
  this->atom->add_extra_data();

  // ------------------- Resize _tep array ------------------------
  // grow with a 10% cushion (factor 1.10) to limit reallocation frequency;
  // _tep holds 4 entries per atom
  if (inum_full>this->_max_tep_size) {
    this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    this->_tep.resize(this->_max_tep_size*4);
  }
  // expose the host-side tep buffer to the caller
  *tep_ptr=this->_tep.host.begin();

  // stash per-call parameters in members read by the kernel launcher
  this->_off2_mpole = off2_mpole;
  this->_felec = felec;
  this->_aewald = aewald;
  multipole_real(this->_eflag,this->_vflag);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the multipole real-space kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::multipole_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff off2_mpole
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_mpole, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
this->k_multipole.set_size(GX,BX);
|
||||
this->k_multipole.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Compute the direct real space part of the permanent field
// returning field and fieldp
// (host-side driver: refreshes uind/uinp/pval on the device, launches
//  udirect2b(), then copies the packed field/fieldp buffer back)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double **host_uind, double **host_uinp, double* host_pval,
                               const double aewald, const double off2_polar,
                               void** fieldp_ptr) {

  // all the necessary data arrays are already copied from host to device;
  // only uind/uinp/pval are refreshed here (other slots are nullptr)
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval);
  this->atom->add_extra_data();

  // expose the host-side field/fieldp buffer to the caller
  *fieldp_ptr=this->_fieldp.host.begin();

  // stash per-call parameters in members read by the kernel launcher
  this->_off2_polar = off2_polar;
  this->_aewald = aewald;
  udirect2b(this->_eflag,this->_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another;
  // hence 8 = 2 arrays x 4 entries per atom)
  this->_fieldp.update_host(this->_max_fieldp_size*8,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the real-space permanent field kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list for the cutoff _off2_polar, if not done yet
|
||||
// this is the first kernel in a time step where _off2_polar is used
|
||||
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_polar, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_udirect2b.set_size(GX,BX);
|
||||
this->k_udirect2b.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
|
||||
&_polar_dscale, &_polar_uscale);
|
||||
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Compute the direct real space term of the induced field
// returning field and fieldp
// (host-side driver: refreshes uind/uinp on the device and launches
//  umutual2b(); the device-to-host transfer is deliberately deferred)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                               double **host_uind, double **host_uinp, double * /*host_pval*/,
                               const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) {

  // cast necessary data arrays from host to device:
  // only uind/uinp are refreshed here (other slots are nullptr)
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
  this->atom->add_extra_data();

  // stash per-call parameters in members read by the kernel launcher
  this->_off2_polar = off2_polar;
  this->_aewald = aewald;
  umutual2b(this->_eflag,this->_vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
  // NOTE: move this step to update_fieldp() to delay device-host transfer
  // *fieldp_ptr=this->_fieldp.host.begin();
  // this->_fieldp.update_host(this->_max_fieldp_size*8,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the real-space induced field kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list if not done yet
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(), &this->dev_short_nbor,
|
||||
&this->_off2_polar, &ainum, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_umutual2b.set_size(GX,BX);
|
||||
this->k_umutual2b.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_aewald,
|
||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// (host-side driver: refreshes uind/uinp, launches polar_real(), queues the
//  force/energy/virial copy-back, and copies tep back to the host)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
                                double **host_uind, double **host_uinp, double * /*host_pval*/,
                                const bool eflag_in, const bool vflag_in,
                                const bool eatom, const bool vatom,
                                const double aewald, const double felec,
                                const double off2_polar, void **tep_ptr) {
  // cast necessary data arrays from host to device:
  // only uind/uinp are refreshed here (other slots are nullptr)
  this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
  this->atom->add_extra_data();

  // expose the host-side tep buffer to the caller
  *tep_ptr=this->_tep.host.begin();

  // stash per-call parameters in members read by the kernel launcher
  this->_off2_polar = off2_polar;
  this->_felec = felec;
  this->_aewald = aewald;
  const int red_blocks=polar_real(this->_eflag,this->_vflag);

  // only copy answers (forces, energies and virial) back from the device
  // in the last kernel in a timestep (which is polar_real here)
  this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  this->device->add_ans_object(this->ans);

  // copy tep from device to host
  this->_tep.update_host(this->_max_tep_size*4,false);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Launch the polar real-space kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int HippoT::polar_real(const int eflag, const int vflag) {
|
||||
int ainum=this->ans->inum();
|
||||
if (ainum == 0)
|
||||
return 0;
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
|
||||
const int BX=this->block_size();
|
||||
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
|
||||
/*
|
||||
const int cus = this->device->gpu->cus();
|
||||
while (GX < cus && GX > 1) {
|
||||
BX /= 2;
|
||||
GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
|
||||
}
|
||||
*/
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list if not done yet
|
||||
if (!this->short_nbor_polar_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &this->_off2_polar, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_polar_avail = true;
|
||||
}
|
||||
|
||||
this->k_polar.set_size(GX,BX);
|
||||
this->k_polar.run(&this->atom->x, &this->atom->extra,
|
||||
&coeff_amtype, &coeff_amclass, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &this->_aewald, &this->_felec,
|
||||
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
|
||||
// Signal that short nbor list is not avail for the next time step
|
||||
// do it here because polar_real() is the last kernel in a time step at this point
|
||||
|
||||
this->short_nbor_polar_avail = false;
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
template class Hippo<PRECISION,ACC_PRECISION>;
|
||||
}
|
||||
2519
lib/gpu/lal_hippo.cu
Normal file
2519
lib/gpu/lal_hippo.cu
Normal file
File diff suppressed because it is too large
Load Diff
166
lib/gpu/lal_hippo.h
Normal file
166
lib/gpu/lal_hippo.h
Normal file
@ -0,0 +1,166 @@
|
||||
/***************************************************************************
|
||||
hippo.h
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Class for acceleration of the hippo pair style.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LAL_HIPPO_H
#define LAL_HIPPO_H

#include "lal_base_amoeba.h"

namespace LAMMPS_AL {

// GPU acceleration class for the HIPPO pair style.  Derives the neighbor
// handling, data transfer and answer bookkeeping from BaseAmoeba and adds
// the HIPPO-specific kernels (repulsion, dispersion, multipole, induced
// field and polar real-space terms).
template <class numtyp, class acctyp>
class Hippo : public BaseAmoeba<numtyp, acctyp> {
 public:
  Hippo();
  ~Hippo();

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param max_nbors initial number of rows in the neighbor matrix
   * \param cell_size cutoff + skin
   * \param gpu_split fraction of particles handled by device
   *
   * Returns:
   * - 0 if successful
   * - -1 if fix gpu not found
   * - -3 if there is an out of memory error
   * - -4 if the GPU library was not compiled for GPU
   * - -5 Double precision is not supported on card **/
  int init(const int ntypes, const int max_amtype, const int max_amclass,
           const double *host_pdamp, const double *host_thole,
           const double *host_dirdamp, const int *host_amtype2class,
           const double *host_special_mpole,
           const double *host_special_repel,
           const double *host_special_disp,
           const double *host_special_polar_wscale,
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
           const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
           const double *host_csix, const double *host_adisp,
           const double *host_pcore, const double *host_palpha,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const int maxspecial15, const double cell_size,
           const double gpu_split, FILE *_screen,
           const double polar_dscale, const double polar_uscale);

  /// Compute repulsion with device neighboring
  virtual void compute_repulsion(const int ago, const int inum_full,
                                 const int nall, double **host_x,
                                 int *host_type, int *host_amtype,
                                 int *host_amgroup, double **host_rpole,
                                 double *sublo, double *subhi, tagint *tag,
                                 int **nspecial, tagint **special,
                                 int *nspecial15, tagint **special15,
                                 const bool eflag_in, const bool vflag_in,
                                 const bool eatom, const bool vatom,
                                 int &host_start, int **ilist, int **jnum,
                                 const double cpu_time, bool &success,
                                 const double aewald, const double off2_repulse,
                                 double *host_q, double *boxlo, double *prd,
                                 double cut2, double c0, double c1, double c2,
                                 double c3, double c4, double c5,void** tep_ptr);

  /// Compute dispersion real-space with device neighboring
  virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                       double **host_rpole, const double aewald,
                                       const double off2_disp);

  /// Compute multipole real-space with device neighboring
  virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
                                      double **host_x, int *host_type, int *host_amtype,
                                      int *host_amgroup, double **host_rpole, double *host_pval,
                                      double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special,
                                      int *nspecial15, tagint **special15,
                                      const bool eflag, const bool vflag,
                                      const bool eatom, const bool vatom, int &host_start,
                                      int **ilist, int **numj, const double cpu_time, bool &success,
                                      const double aewald, const double felec, const double off2_mpole, double *charge,
                                      double *boxlo, double *prd, void **tep_ptr);

  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
  virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double* host_pval,
                                 const double aewald, const double off2_polar, void** fieldp_ptr);

  /// Compute the real space part of the induced field (umutual2b) with device neighboring
  virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2_polar,
                                 void** fieldp_ptr);

  /// Compute polar real-space with device neighboring
  virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag_in, const bool vflag_in,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2_polar,
                                  void **tep_ptr);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();

  /// Returns memory usage on device per atom
  int bytes_per_atom(const int max_nbors) const;

  /// Total host memory used by library for pair style
  double host_memory_usage() const;

  // --------------------------- TYPE DATA --------------------------

  /// Per-amtype coefficients packed into a vec4:
  /// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
  /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
  UCL_D_Vec<numtyp4> coeff_amtype;
  /// Per-amclass coefficients:
  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
  UCL_D_Vec<numtyp4> coeff_amclass;
  /// Pauli repulsion coefficients:
  /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z;
  UCL_D_Vec<numtyp4> coeff_rep;
  /// Special polar values [0-4]:
  /// sp_polar.x = special_polar_wscale
  /// sp_polar.y = special_polar_pscale
  /// sp_polar.z = special_polar_piscale
  /// sp_polar.w = special_mpole
  UCL_D_Vec<numtyp4> sp_polar;
  /// Special nonpolar values [0-4]:
  /// sp_nonpolar.x = special_hal
  /// sp_nonpolar.y = special_repel
  /// sp_nonpolar.z = special_disp
  UCL_D_Vec<numtyp4> sp_nonpolar;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;

  /// Number of atom types
  int _lj_types;

  // repulsion cutoff and polynomial coefficients (set in init)
  numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5;
  // polar damping scale factors (set in init)
  numtyp _polar_dscale, _polar_uscale;
  // conversion factor for electrostatic energy/force units
  numtyp _qqrd2e;

  // HIPPO-specific kernels (the polar-family kernels live in the base class)
  UCL_Kernel k_repulsion, k_dispersion;

 protected:
  // true once device buffers have been allocated by init()
  bool _allocated;
  // kernel launchers; each returns the number of blocks launched
  int repulsion(const int eflag, const int vflag);
  int dispersion_real(const int eflag, const int vflag);
  int multipole_real(const int eflag, const int vflag);
  int udirect2b(const int eflag, const int vflag);
  int umutual2b(const int eflag, const int vflag);
  int polar_real(const int eflag, const int vflag);

};

}

#endif
|
||||
231
lib/gpu/lal_hippo_ext.cpp
Normal file
231
lib/gpu/lal_hippo_ext.cpp
Normal file
@ -0,0 +1,231 @@
|
||||
/***************************************************************************
|
||||
hippo_ext.cpp
|
||||
-------------------
|
||||
Trung Dac Nguyen (Northwestern)
|
||||
|
||||
Functions for LAMMPS access to hippo acceleration routines.
|
||||
|
||||
__________________________________________________________________________
|
||||
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
__________________________________________________________________________
|
||||
|
||||
begin :
|
||||
email : trung.nguyen@northwestern.edu
|
||||
***************************************************************************/
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
#include "lal_hippo.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace LAMMPS_AL;
|
||||
|
||||
static Hippo<PRECISION,ACC_PRECISION> HIPPOMF;
|
||||
|
||||
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
//
// Initializes the file-scope HIPPOMF instance.  Process 0 of the world
// communicator initializes (and compiles kernels) first; the remaining
// ranks then initialize one-per-GPU-rank inside the barrier-ordered loop
// below.  Returns the status code from Hippo::init() (0 on success).
// NOTE(review): the ordering is enforced by world_barrier()/gpu_barrier();
// presumably so later ranks can reuse the kernels compiled by process 0.
// ---------------------------------------------------------------------------
int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                   const double *host_pdamp, const double *host_thole,
                   const double *host_dirdamp, const int *host_amtype2class,
                   const double *host_special_repel,
                   const double *host_special_disp,
                   const double *host_special_mpole,
                   const double *host_special_polar_wscale,
                   const double *host_special_polar_piscale,
                   const double *host_special_polar_pscale,
                   const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
                   const double *host_csix, const double *host_adisp,
                   const double *host_pcore, const double *host_palpha,
                   const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const int maxspecial15,
                   const double cell_size, int &gpu_mode, FILE *screen,
                   const double polar_dscale, const double polar_uscale) {
  HIPPOMF.clear();
  gpu_mode=HIPPOMF.device->gpu_mode();
  double gpu_split=HIPPOMF.device->particle_split();
  int first_gpu=HIPPOMF.device->first_device();
  int last_gpu=HIPPOMF.device->last_device();
  int world_me=HIPPOMF.device->world_me();
  int gpu_rank=HIPPOMF.device->gpu_rank();
  int procs_per_gpu=HIPPOMF.device->procs_per_gpu();

  HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu);

  // only replica rank 0 with a valid screen prints progress messages
  bool message=false;
  if (HIPPOMF.device->replica_me()==0 && screen)
    message=true;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // world rank 0 initializes first (kernel compilation happens here)
  int init_ok=0;
  if (world_me==0)
    init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
                         host_pdamp, host_thole, host_dirdamp,
                         host_amtype2class, host_special_repel, host_special_disp,
                         host_special_mpole, host_special_polar_wscale,
                         host_special_polar_piscale, host_special_polar_pscale,
                         host_sizpr, host_dmppr, host_elepr,
                         host_csix, host_adisp, host_pcore, host_palpha,
                         nlocal, nall, max_nbors,
                         maxspecial, maxspecial15, cell_size, gpu_split,
                         screen, polar_dscale, polar_uscale);

  HIPPOMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // remaining ranks initialize in gpu_rank order, one wave per iteration,
  // separated by gpu_barrier() so initialization is serialized per GPU
  for (int i=0; i<procs_per_gpu; i++) {
    if (message) {
      if (last_gpu-first_gpu==0)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,i);
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
                           host_pdamp, host_thole, host_dirdamp,
                           host_amtype2class, host_special_repel, host_special_disp,
                           host_special_mpole, host_special_polar_wscale,
                           host_special_polar_piscale, host_special_polar_pscale,
                           host_sizpr, host_dmppr, host_elepr,
                           host_csix, host_adisp, host_pcore, host_palpha,
                           nlocal, nall, max_nbors,
                           maxspecial, maxspecial15, cell_size, gpu_split,
                           screen, polar_dscale, polar_uscale);

    HIPPOMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");

  if (init_ok==0)
    HIPPOMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||
|
||||
// Release all host and device data held by the HIPPO instance.
void hippo_gpu_clear() {
  HIPPOMF.clear();
}
|
||||
|
||||
// Forward to Hippo::precompute (neighbor/data setup for the time step).
// The uind/uinp/pval slots are passed as nullptr here; those arrays are
// transferred later by the individual compute calls.
int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
                           int *host_amgroup, double **host_rpole,
                           double ** /*host_uind*/, double ** /*host_uinp*/, double * /*host_pval*/,
                           double *sublo, double *subhi, tagint *tag,
                           int **nspecial, tagint **special,
                           int *nspecial15, tagint **special15,
                           const bool eflag_in, const bool vflag_in,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, double *host_q, double *boxlo, double *prd) {
  return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type,
                            host_amtype, host_amgroup, host_rpole,
                            nullptr, nullptr, nullptr, sublo, subhi, tag,
                            nspecial, special, nspecial15, special15,
                            eflag_in, vflag_in, eatom, vatom,
                            host_start, ilist, jnum, cpu_time,
                            success, host_q, boxlo, prd);
}
|
||||
|
||||
// Forward the Pauli repulsion computation (with device neighboring) to the
// HIPPO instance; see Hippo::compute_repulsion for parameter semantics.
void hippo_gpu_compute_repulsion(const int ago, const int inum_full,
                                 const int nall, double **host_x, int *host_type,
                                 int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double *sublo, double *subhi, tagint *tag, int **nspecial,
                                 tagint **special, int *nspecial15, tagint** special15,
                                 const bool eflag, const bool vflag, const bool eatom,
                                 const bool vatom, int &host_start,
                                 int **ilist, int **jnum, const double cpu_time,
                                 bool &success, const double aewald, const double off2,
                                 double *host_q, double *boxlo, double *prd,
                                 double cut2, double c0, double c1, double c2,
                                 double c3, double c4, double c5, void **tep_ptr) {
  HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type,
                            host_amtype, host_amgroup, host_rpole, sublo, subhi,
                            tag, nspecial, special, nspecial15, special15,
                            eflag, vflag, eatom, vatom, host_start, ilist, jnum,
                            cpu_time, success, aewald, off2, host_q, boxlo, prd,
                            cut2, c0, c1, c2, c3, c4, c5, tep_ptr);
}
|
||||
|
||||
// Forward the real-space dispersion computation to the HIPPO instance.
void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup,
                                       double **host_rpole, const double aewald,
                                       const double off2) {
  HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole,
                                  aewald, off2);
}
|
||||
|
||||
// Forward the real-space multipole computation to the HIPPO instance;
// tep_ptr receives the host-side tep buffer on return.
void hippo_gpu_compute_multipole_real(const int ago, const int inum_full,
                                      const int nall, double **host_x, int *host_type,
                                      int *host_amtype, int *host_amgroup, double **host_rpole,
                                      double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial,
                                      tagint **special, int *nspecial15, tagint** special15,
                                      const bool eflag, const bool vflag, const bool eatom,
                                      const bool vatom, int &host_start,
                                      int **ilist, int **jnum, const double cpu_time,
                                      bool &success, const double aewald, const double felec, const double off2,
                                      double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
                                 host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi,
                                 tag, nspecial, special, nspecial15, special15,
                                 eflag, vflag, eatom, vatom, host_start, ilist, jnum,
                                 cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
}
|
||||
|
||||
// Forward the permanent-field (udirect2b) computation to the HIPPO
// instance; fieldp_ptr receives the packed field/fieldp host buffer.
void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2, void **fieldp_ptr) {
  HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole,
                            host_uind, host_uinp, host_pval,
                            aewald, off2, fieldp_ptr);
}
|
||||
|
||||
// Forward the induced-field (umutual2b) computation to the HIPPO instance.
// The device-to-host transfer of the result is deferred; retrieve it later
// via hippo_gpu_update_fieldp().
void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
                                 double **host_uind, double **host_uinp, double *host_pval,
                                 const double aewald, const double off2, void **fieldp_ptr) {
  HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
                            aewald, off2, fieldp_ptr);
}
|
||||
|
||||
// Copy the packed field/fieldp buffer from device to host and hand the
// host pointer back through fieldp_ptr (see Hippo/BaseAmoeba::update_fieldp).
void hippo_gpu_update_fieldp(void **fieldp_ptr) {
  HIPPOMF.update_fieldp(fieldp_ptr);
}
|
||||
|
||||
// Forward the polar real-space computation (the last per-step kernel) to
// the HIPPO instance; tep_ptr receives the host-side tep buffer on return.
void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
                                  double **host_uind, double **host_uinp, double *host_pval,
                                  const bool eflag_in, const bool vflag_in,
                                  const bool eatom, const bool vatom,
                                  const double aewald, const double felec, const double off2,
                                  void **tep_ptr) {
  HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
                             eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
|
||||
|
||||
// Forward the k-space precompute (B-spline theta coefficients, grid
// indices and the brick-grid bounds) to the HIPPO instance.
void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
                                 double ***host_thetai1, double ***host_thetai2,
                                 double ***host_thetai3, int** igrid,
                                 const int nzlo_out, const int nzhi_out,
                                 const int nylo_out, const int nyhi_out,
                                 const int nxlo_out, const int nxhi_out) {
  HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2,
                            host_thetai3, igrid, nzlo_out, nzhi_out,
                            nylo_out, nyhi_out, nxlo_out, nxhi_out);
}
|
||||
|
||||
// Forward the fphi_uind (induced-dipole potential interpolation) step to
// the HIPPO instance.
void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
                         void **host_fdip_phi2, void **host_fdip_sum_phi) {
  HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi);
}
|
||||
|
||||
// Report host memory (in bytes) used by the HIPPO acceleration library.
double hippo_gpu_bytes() {
  return HIPPOMF.host_memory_usage();
}
|
||||
431
lib/gpu/lal_hippo_extra.h
Normal file
431
lib/gpu/lal_hippo_extra.h
Normal file
@ -0,0 +1,431 @@
|
||||
/// **************************************************************************
|
||||
// hippo_extra.h
|
||||
// -------------------
|
||||
// Trung Dac Nguyen
|
||||
//
|
||||
// Device code for hippo math routines
|
||||
//
|
||||
// __________________________________________________________________________
|
||||
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
||||
// __________________________________________________________________________
|
||||
//
|
||||
// begin :
|
||||
// email : ndactrung@gmail.com
|
||||
// ***************************************************************************/
|
||||
|
||||
#ifndef LAL_HIPPO_EXTRA_H
|
||||
#define LAL_HIPPO_EXTRA_H
|
||||
|
||||
#if defined(NV_KERNEL) || defined(USE_HIP)
|
||||
#include "lal_aux_fun1.h"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#define MY_PI2 (numtyp)1.57079632679489661923
|
||||
#define MY_PI4 (numtyp)0.78539816339744830962
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
damprep generates coefficients for the Pauli repulsion
|
||||
damping function for powers of the interatomic distance
|
||||
|
||||
literature reference:
|
||||
|
||||
J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An
|
||||
Anisotropic, Atomic Multipole Model", Journal of Chemical Physics,
|
||||
150, 084104 (2019)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
|
||||
const numtyp rr3, const numtyp rr5, const numtyp rr7,
|
||||
const numtyp rr9, const numtyp rr11, const int rorder,
|
||||
const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11])
|
||||
{
|
||||
numtyp r3,r4;
|
||||
numtyp r5,r6,r7,r8;
|
||||
numtyp s,ds,d2s;
|
||||
numtyp d3s,d4s,d5s;
|
||||
numtyp dmpi2,dmpk2;
|
||||
numtyp dmpi22,dmpi23;
|
||||
numtyp dmpi24,dmpi25;
|
||||
numtyp dmpi26,dmpi27;
|
||||
numtyp dmpk22,dmpk23;
|
||||
numtyp dmpk24,dmpk25;
|
||||
numtyp dmpk26;
|
||||
numtyp eps,diff;
|
||||
numtyp expi,expk;
|
||||
numtyp dampi,dampk;
|
||||
numtyp pre,term,tmp;
|
||||
|
||||
// compute tolerance value for damping exponents
|
||||
|
||||
eps = (numtyp)0.001;
|
||||
diff = dmpi-dmpk; // fabs(dmpi-dmpk)
|
||||
if (diff < (numtyp)0) diff = -diff;
|
||||
|
||||
// treat the case where alpha damping exponents are equal
|
||||
|
||||
if (diff < eps) {
|
||||
r3 = r2 * r;
|
||||
r4 = r3 * r;
|
||||
r5 = r4 * r;
|
||||
r6 = r5 * r;
|
||||
r7 = r6 * r;
|
||||
dmpi2 = (numtyp)0.5 * dmpi;
|
||||
dampi = dmpi2 * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
dmpi22 = dmpi2 * dmpi2;
|
||||
dmpi23 = dmpi22 * dmpi2;
|
||||
dmpi24 = dmpi23 * dmpi2;
|
||||
dmpi25 = dmpi24 * dmpi2;
|
||||
dmpi26 = dmpi25 * dmpi2;
|
||||
pre = (numtyp)128.0;
|
||||
s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi;
|
||||
|
||||
ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0;
|
||||
d2s = dmpi24 * expi * r5 / (numtyp)9.0;
|
||||
d3s = dmpi25 * expi * r6 / (numtyp)45.0;
|
||||
d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0;
|
||||
if (rorder >= 11) {
|
||||
r8 = r7 * r;
|
||||
dmpi27 = dmpi2 * dmpi26;
|
||||
d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0;
|
||||
}
|
||||
|
||||
// treat the case where alpha damping exponents are unequal
|
||||
|
||||
} else {
|
||||
r3 = r2 * r;
|
||||
r4 = r3 * r;
|
||||
r5 = r4 * r;
|
||||
dmpi2 = (numtyp)0.5 * dmpi;
|
||||
dmpk2 = (numtyp)0.5 * dmpk;
|
||||
dampi = dmpi2 * r;
|
||||
dampk = dmpk2 * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
expk = ucl_exp(-dampk);
|
||||
dmpi22 = dmpi2 * dmpi2;
|
||||
dmpi23 = dmpi22 * dmpi2;
|
||||
dmpi24 = dmpi23 * dmpi2;
|
||||
dmpi25 = dmpi24 * dmpi2;
|
||||
dmpk22 = dmpk2 * dmpk2;
|
||||
dmpk23 = dmpk22 * dmpk2;
|
||||
dmpk24 = dmpk23 * dmpk2;
|
||||
dmpk25 = dmpk24 * dmpk2;
|
||||
term = dmpi22 - dmpk22;
|
||||
pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0);
|
||||
tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term;
|
||||
s = (dampi-tmp)*expk + (dampk+tmp)*expi;
|
||||
|
||||
ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term -
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
|
||||
(dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
|
||||
d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 -
|
||||
((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
|
||||
(dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 +
|
||||
((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expi;
|
||||
d3s = (dmpi2*dmpk23*r4/(numtyp)15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 -
|
||||
((numtyp)4.0/(numtyp)15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term -
|
||||
(numtyp)4.0*dmpi2*dmpk22*r/term - (numtyp)4.0/term*dmpi2*dmpk2) * expk +
|
||||
(dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 +
|
||||
((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term +
|
||||
(numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi;
|
||||
d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 +
|
||||
dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 -
|
||||
((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term -
|
||||
((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
|
||||
(dmpi24*dmpk2*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 +
|
||||
dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 +
|
||||
((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term +
|
||||
((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expi;
|
||||
|
||||
if (rorder >= 11) {
|
||||
r6 = r5 * r;
|
||||
dmpi26 = dmpi25 * dmpi2;
|
||||
dmpk26 = dmpk25 * dmpk2;
|
||||
d5s = (dmpi2*dmpk25*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi2*dmpk24*r5 +
|
||||
dmpi2*dmpk23*r4/(numtyp)21.0 + dmpi2*dmpk22*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 -
|
||||
((numtyp)4.0/(numtyp)945.0)*dmpi2*dmpk26*r5/term -
|
||||
((numtyp)4.0/(numtyp)63.0)*dmpi2*dmpk25*r4/term - ((numtyp)4.0/(numtyp)9.0)*dmpi2*dmpk24*r3/term -
|
||||
((numtyp)16.0/(numtyp)9.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
|
||||
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
|
||||
(dmpi25*dmpk2*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi24*dmpk2*r5 +
|
||||
dmpi23*dmpk2*r4/(numtyp)21.0 + dmpi22*dmpk2*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 +
|
||||
((numtyp)4.0/(numtyp)945.0)*dmpi26*dmpk2*r5/term + ((numtyp)4.0/(numtyp)63.0)*dmpi25*dmpk2*r4/term +
|
||||
((numtyp)4.0/(numtyp)9.0)*dmpi24*dmpk2*r3/term + ((numtyp)16.0/(numtyp)9.0)*dmpi23*dmpk2*r2/term +
|
||||
(numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
|
||||
}
|
||||
}
|
||||
|
||||
// convert partial derivatives into full derivatives
|
||||
|
||||
s = s * rr1;
|
||||
ds = ds * rr3;
|
||||
d2s = d2s * rr5;
|
||||
d3s = d3s * rr7;
|
||||
d4s = d4s * rr9;
|
||||
d5s = d5s * rr11;
|
||||
dmpik[0] = (numtyp)0.5 * pre * s * s;
|
||||
dmpik[2] = pre * s * ds;
|
||||
dmpik[4] = pre * (s*d2s + ds*ds);
|
||||
dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s);
|
||||
dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s);
|
||||
|
||||
if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
damppole generates coefficients for the charge penetration
|
||||
damping function for powers of the interatomic distance
|
||||
|
||||
literature references:
|
||||
|
||||
L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the
|
||||
Effective Fragment Potential Method: Theory and Application to
|
||||
the Benzene Dimer", Journal of Computational Chemistry, 28,
|
||||
276-291 (2007) [Gordon f1 and f2 models]
|
||||
|
||||
J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and
|
||||
J. W. Ponder, "An Optimized Charge Penetration Model for Use with
|
||||
the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19,
|
||||
276-291 (2017)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
ucl_inline void damppole(const numtyp r, const int rorder,
|
||||
const numtyp alphai, const numtyp alphak,
|
||||
numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11])
|
||||
{
|
||||
numtyp termi,termk;
|
||||
numtyp termi2,termk2;
|
||||
numtyp alphai2,alphak2;
|
||||
numtyp eps,diff;
|
||||
numtyp expi,expk;
|
||||
numtyp dampi,dampk;
|
||||
numtyp dampi2,dampi3;
|
||||
numtyp dampi4,dampi5;
|
||||
numtyp dampi6,dampi7;
|
||||
numtyp dampi8;
|
||||
numtyp dampk2,dampk3;
|
||||
numtyp dampk4,dampk5;
|
||||
numtyp dampk6;
|
||||
|
||||
// compute tolerance and exponential damping factors
|
||||
|
||||
eps = (numtyp)0.001;
|
||||
diff = alphai-alphak;
|
||||
if (diff < (numtyp)0) diff = -diff;
|
||||
dampi = alphai * r;
|
||||
dampk = alphak * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
expk = ucl_exp(-dampk);
|
||||
|
||||
// core-valence charge penetration damping for Gordon f1
|
||||
|
||||
dampi2 = dampi * dampi;
|
||||
dampi3 = dampi * dampi2;
|
||||
dampi4 = dampi2 * dampi2;
|
||||
dampi5 = dampi2 * dampi3;
|
||||
dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi;
|
||||
dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
|
||||
dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
|
||||
dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
|
||||
dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
(numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi;
|
||||
if (diff < eps) {
|
||||
dmpk[0] = dmpi[0];
|
||||
dmpk[2] = dmpi[2];
|
||||
dmpk[4] = dmpi[4];
|
||||
dmpk[6] = dmpi[6];
|
||||
dmpk[8] = dmpi[8];
|
||||
} else {
|
||||
dampk2 = dampk * dampk;
|
||||
dampk3 = dampk * dampk2;
|
||||
dampk4 = dampk2 * dampk2;
|
||||
dampk5 = dampk2 * dampk3;
|
||||
dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk;
|
||||
dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
|
||||
dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
|
||||
dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk;
|
||||
dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
|
||||
(numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk;
|
||||
}
|
||||
|
||||
// valence-valence charge penetration damping for Gordon f1
|
||||
|
||||
if (diff < eps) {
|
||||
dampi6 = dampi3 * dampi3;
|
||||
dampi7 = dampi3 * dampi4;
|
||||
dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 +
|
||||
dampi3/(numtyp)48.0)*expi;
|
||||
dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
|
||||
(numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi;
|
||||
dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
|
||||
dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi;
|
||||
dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
|
||||
dampi7/(numtyp)5040.0)*expi;
|
||||
if (rorder >= 11) {
|
||||
dampi8 = dampi4 * dampi4;
|
||||
dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
|
||||
dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi;
|
||||
}
|
||||
|
||||
} else {
|
||||
alphai2 = alphai * alphai;
|
||||
alphak2 = alphak * alphak;
|
||||
termi = alphak2 / (alphak2-alphai2);
|
||||
termk = alphai2 / (alphai2-alphak2);
|
||||
termi2 = termi * termi;
|
||||
termk2 = termk * termk;
|
||||
dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi -
|
||||
termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk;
|
||||
dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
|
||||
termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
|
||||
dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
|
||||
termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk;
|
||||
dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
|
||||
dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi -
|
||||
termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk;
|
||||
dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
(numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi -
|
||||
termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
|
||||
(numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 +
|
||||
(numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 +
|
||||
(numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk;
|
||||
|
||||
if (rorder >= 11) {
|
||||
dampi6 = dampi3 * dampi3;
|
||||
dampk6 = dampk3 * dampk3;
|
||||
dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
(numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 +
|
||||
dampi6/(numtyp)1890.0)*expi -
|
||||
termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 +
|
||||
(numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 +
|
||||
dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi -
|
||||
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 +
|
||||
dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
dampdir = direct field damping coefficents
|
||||
dampdir generates coefficients for the direct field damping
|
||||
function for powers of the interatomic distance
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk)
|
||||
{
|
||||
numtyp eps,diff;
|
||||
numtyp expi,expk;
|
||||
numtyp dampi,dampk;
|
||||
numtyp dampi2,dampk2;
|
||||
numtyp dampi3,dampk3;
|
||||
numtyp dampi4,dampk4;
|
||||
|
||||
// compute tolerance and exponential damping factors
|
||||
|
||||
eps = (numtyp)0.001;
|
||||
diff = alphai-alphak; // fabs(alphai-alphak);
|
||||
if (diff < (numtyp)0) diff = -diff;
|
||||
dampi = alphai * r;
|
||||
dampk = alphak * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
expk = ucl_exp(-dampk);
|
||||
|
||||
// core-valence charge penetration damping for Gordon f1 (HIPPO)
|
||||
|
||||
dampi2 = dampi * dampi;
|
||||
dampi3 = dampi * dampi2;
|
||||
dampi4 = dampi2 * dampi2;
|
||||
dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
|
||||
dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
|
||||
dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
|
||||
if (diff < eps) {
|
||||
dmpk[2] = dmpi[2];
|
||||
dmpk[4] = dmpi[4];
|
||||
dmpk[6] = dmpi[6];
|
||||
} else {
|
||||
dampk2 = dampk * dampk;
|
||||
dampk3 = dampk * dampk2;
|
||||
dampk4 = dampk2 * dampk2;
|
||||
dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
|
||||
dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
|
||||
dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/30.0)*expk;
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
dampmut = mutual field damping coefficents
|
||||
dampmut generates coefficients for the mutual field damping
|
||||
function for powers of the interatomic distance
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5])
|
||||
{
|
||||
numtyp termi,termk;
|
||||
numtyp termi2,termk2;
|
||||
numtyp alphai2,alphak2;
|
||||
numtyp eps,diff;
|
||||
numtyp expi,expk;
|
||||
numtyp dampi,dampk;
|
||||
numtyp dampi2,dampi3;
|
||||
numtyp dampi4,dampi5;
|
||||
numtyp dampk2,dampk3;
|
||||
|
||||
// compute tolerance and exponential damping factors
|
||||
|
||||
eps = (numtyp)0.001;
|
||||
diff = alphai-alphak; // fabs(alphai-alphak);
|
||||
if (diff < (numtyp)0) diff = -diff;
|
||||
dampi = alphai * r;
|
||||
dampk = alphak * r;
|
||||
expi = ucl_exp(-dampi);
|
||||
expk = ucl_exp(-dampk);
|
||||
|
||||
// valence-valence charge penetration damping for Gordon f1 (HIPPO)
|
||||
|
||||
dampi2 = dampi * dampi;
|
||||
dampi3 = dampi * dampi2;
|
||||
if (diff < eps) {
|
||||
dampi4 = dampi2 * dampi2;
|
||||
dampi5 = dampi2 * dampi3;
|
||||
dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
|
||||
7.0*dampi3/(numtyp)48.0 + dampi4/48.0)*expi;
|
||||
dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
|
||||
dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
|
||||
} else {
|
||||
dampk2 = dampk * dampk;
|
||||
dampk3 = dampk * dampk2;
|
||||
alphai2 = alphai * alphai;
|
||||
alphak2 = alphak * alphak;
|
||||
termi = alphak2 / (alphak2-alphai2);
|
||||
termk = alphai2 / (alphai2-alphak2);
|
||||
termi2 = termi * termi;
|
||||
termk2 = termk * termk;
|
||||
dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
|
||||
termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
|
||||
(numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
|
||||
dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
|
||||
termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk -
|
||||
(numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi -
|
||||
(numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -576,6 +576,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
time_nbor.stop();
|
||||
if (_time_device)
|
||||
time_nbor.add_to_total();
|
||||
|
||||
// on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial)
|
||||
// on the device, transpose the matrix (1-d array) for coalesced reads
|
||||
// dev_special[i][j] = the special i neighbor of atom j
|
||||
|
||||
time_transpose.start();
|
||||
const int b2x=_block_cell_2d;
|
||||
const int b2y=_block_cell_2d;
|
||||
@ -679,6 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
if (_cutoff < _cell_size) vadjust*=1.46;
|
||||
mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
|
||||
if (mn<33) mn+=3;
|
||||
|
||||
resize_max_neighbors<numtyp,acctyp>(mn,success);
|
||||
set_nbor_block_size(mn/2);
|
||||
if (!success)
|
||||
@ -831,6 +837,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
||||
time_nbor.stop();
|
||||
}
|
||||
|
||||
// Launch the device transpose kernel so that "out" holds the transpose of
// the matrix stored in "in" (per the surrounding code, this reorders the
// special-neighbor data for coalesced device reads).
void Neighbor::transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
                         const int columns_in, const int rows_in)
{
  // square 2-d thread block; grid dimensions round up so every element
  // of the matrix is covered
  const int block_x = _block_cell_2d;
  const int block_y = _block_cell_2d;
  const int grid_x =
    static_cast<int>(ceil(static_cast<double>(columns_in)/block_x));
  const int grid_y =
    static_cast<int>(ceil(static_cast<double>(rows_in)/block_y));
  _shared->k_transpose.set_size(grid_x,grid_y,block_x,block_y);
  _shared->k_transpose.run(&out, &in, &columns_in, &rows_in);
}
|
||||
|
||||
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
|
||||
(double **x, const int inum, const int host_inum, const int nall,
|
||||
Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
|
||||
|
||||
@ -33,7 +33,7 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(USE_HIP)
|
||||
#if defined(USE_HIP) || defined(__APPLE__)
|
||||
#define LAL_USE_OLD_NEIGHBOR
|
||||
#endif
|
||||
|
||||
@ -259,6 +259,10 @@ class Neighbor {
|
||||
return o.str();
|
||||
}
|
||||
|
||||
/// Helper function
|
||||
void transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
|
||||
const int columns_in, const int rows_in);
|
||||
|
||||
private:
|
||||
NeighborShared *_shared;
|
||||
UCL_Device *dev;
|
||||
@ -289,15 +293,17 @@ class Neighbor {
|
||||
#endif
|
||||
|
||||
int _simd_size;
|
||||
inline void set_nbor_block_size(const int mn) {
|
||||
#ifdef LAL_USE_OLD_NEIGHBOR
|
||||
inline void set_nbor_block_size(const int mn) {
|
||||
int desired=mn/(2*_simd_size);
|
||||
desired*=_simd_size;
|
||||
if (desired<_simd_size) desired=_simd_size;
|
||||
else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build;
|
||||
_block_nbor_build=desired;
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
inline void set_nbor_block_size(const int) {}
|
||||
#endif
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -48,6 +48,19 @@ _texture_2d( pos_tex,int4);
|
||||
#define LAL_USE_OLD_NEIGHBOR
|
||||
#endif
|
||||
|
||||
/*
|
||||
compute the id of the cell where the atoms belong to
|
||||
x: atom coordinates
|
||||
cell_id: cell ids
|
||||
particle_id:
|
||||
boxlo[0-2]: the lower left corner of the local box
|
||||
ncell[xyz]: the number of cells in xyz dims
|
||||
i_cell_size is the inverse cell size
|
||||
inum = the number of the local atoms that are ported to the device
|
||||
nall = the number of the local+ghost atoms that are ported to the device
|
||||
cells_in_cutoff = the number of cells that are within the cutoff
|
||||
*/
|
||||
|
||||
__kernel void calc_cell_id(const numtyp4 *restrict x_,
|
||||
unsigned *restrict cell_id,
|
||||
int *restrict particle_id,
|
||||
@ -90,6 +103,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
|
||||
}
|
||||
}
|
||||
|
||||
// compute the number of atoms in each cell
|
||||
|
||||
__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
|
||||
int *restrict cell_counts,
|
||||
int nall, int ncell) {
|
||||
|
||||
@ -182,12 +182,15 @@
|
||||
#define ucl_cbrt cbrt
|
||||
#define ucl_ceil ceil
|
||||
#define ucl_abs fabs
|
||||
#define ucl_recip(x) ((numtyp)1.0/(x))
|
||||
#define ucl_rsqrt rsqrt
|
||||
#define ucl_sqrt sqrt
|
||||
#define ucl_recip(x) ((numtyp)1.0/(x))
|
||||
#define ucl_erfc erfc
|
||||
|
||||
#else
|
||||
|
||||
#define ucl_exp expf
|
||||
#define ucl_powr powf
|
||||
#define ucl_atan atanf
|
||||
#define ucl_cbrt cbrtf
|
||||
#define ucl_ceil ceilf
|
||||
@ -195,8 +198,7 @@
|
||||
#define ucl_recip(x) ((numtyp)1.0/(x))
|
||||
#define ucl_rsqrt rsqrtf
|
||||
#define ucl_sqrt sqrtf
|
||||
#define ucl_exp expf
|
||||
#define ucl_powr powf
|
||||
#define ucl_erfc erfcf
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -166,6 +166,7 @@
|
||||
#define ucl_cbrt cbrt
|
||||
#define ucl_ceil ceil
|
||||
#define ucl_abs fabs
|
||||
#define ucl_erfc erfc
|
||||
|
||||
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
|
||||
|
||||
@ -330,6 +331,10 @@
|
||||
#define NEIGHMASK 0x3FFFFFFF
|
||||
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
|
||||
|
||||
#define SBBITS15 29
|
||||
#define NEIGHMASK15 0x1FFFFFFF
|
||||
ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; };
|
||||
|
||||
// default to 32-bit smallint and other ints, 64-bit bigint:
|
||||
// same as defined in src/lmptype.h
|
||||
#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \
|
||||
|
||||
@ -150,7 +150,7 @@ double SWT::host_memory_usage() const {
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int SWT::loop(const int eflag, const int vflag, const int evatom,
|
||||
bool &success) {
|
||||
bool & /*success*/) {
|
||||
const int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// build the short neighbor list
|
||||
|
||||
@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4);
|
||||
} \
|
||||
}
|
||||
|
||||
// (SHUFFLE_AVAIL == 1)
|
||||
#else
|
||||
|
||||
#define local_allocate_acc_zeta()
|
||||
@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4);
|
||||
} \
|
||||
}
|
||||
|
||||
// EVFLAG == 0
|
||||
#else
|
||||
|
||||
#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \
|
||||
@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4);
|
||||
ans[ii]=old; \
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif // EVFLAG
|
||||
#endif // SHUFFLE_AVAIL
|
||||
|
||||
#ifdef LAL_SIMD_IP_SYNC
|
||||
#define t_per_atom t_per_atom_in
|
||||
|
||||
@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
|
||||
const double* costheta, const double* bigb,
|
||||
const double* big2b, const double* bigc)
|
||||
{
|
||||
int success;
|
||||
int success=0;
|
||||
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
|
||||
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
|
||||
"k_vashishta_three_end","k_vashishta_short_nbor");
|
||||
@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const {
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int VashishtaT::loop(const int eflag, const int vflag, const int evatom,
|
||||
bool &success) {
|
||||
bool & /*success*/) {
|
||||
const int nbor_pitch=this->nbor->nbor_pitch();
|
||||
|
||||
// build the short neighbor list
|
||||
|
||||
@ -1,199 +0,0 @@
|
||||
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
|
||||
index 22af411f32..530510a0d1 100644
|
||||
--- a/lib/kokkos/Makefile.kokkos
|
||||
+++ b/lib/kokkos/Makefile.kokkos
|
||||
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Threads"
|
||||
# Options:
|
||||
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
|
||||
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
|
||||
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
|
||||
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
|
||||
# IBM: BGQ,Power7,Power8,Power9
|
||||
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
|
||||
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
|
||||
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
|
||||
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
|
||||
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
|
||||
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
|
||||
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
|
||||
|
||||
#SEK: This seems like a bug to me
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
|
||||
endif
|
||||
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
|
||||
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
|
||||
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
|
||||
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
|
||||
+ endif
|
||||
|
||||
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
|
||||
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
|
||||
index 88ddc48378..b83ced9243 100644
|
||||
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
|
||||
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
|
||||
@@ -102,6 +102,7 @@
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE80
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE86
|
||||
+#cmakedefine KOKKOS_ARCH_HOPPER90
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
|
||||
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
|
||||
index f56cef1651..2585a6a64c 100644
|
||||
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
|
||||
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
|
||||
@@ -74,6 +74,7 @@ int main() {
|
||||
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
|
||||
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
|
||||
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
|
||||
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
|
||||
default:
|
||||
std::cout << "Compute capability " << compute_capability
|
||||
<< " is not supported" << std::endl;
|
||||
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
|
||||
index ef16aad047..c1d76cceeb 100644
|
||||
--- a/lib/kokkos/cmake/kokkos_arch.cmake
|
||||
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
|
||||
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
|
||||
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
|
||||
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
|
||||
SET(KOKKOS_SHOW_HIP_ARCHS ON)
|
||||
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
|
||||
CHECK_CUDA_ARCH(TURING75 sm_75)
|
||||
CHECK_CUDA_ARCH(AMPERE80 sm_80)
|
||||
CHECK_CUDA_ARCH(AMPERE86 sm_86)
|
||||
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
|
||||
|
||||
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
|
||||
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
|
||||
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
|
||||
SET(KOKKOS_ARCH_AMPERE ON)
|
||||
ENDIF()
|
||||
|
||||
+IF (KOKKOS_ARCH_HOPPER90)
|
||||
+ SET(KOKKOS_ARCH_HOPPER ON)
|
||||
+ENDIF()
|
||||
+
|
||||
#Regardless of version, make sure we define the general architecture name
|
||||
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
|
||||
SET(KOKKOS_ARCH_VEGA ON)
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index 56f9117844..fcd4773dbc 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
|
||||
case 61: return 96;
|
||||
case 70:
|
||||
case 80:
|
||||
- case 86: return 8;
|
||||
+ case 86:
|
||||
+ case 90: return 8;
|
||||
case 75: return 32;
|
||||
default:
|
||||
Kokkos::Impl::throw_runtime_exception(
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
|
||||
index 40a263561f..8c40ebd60d 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
|
||||
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
|
||||
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
|
||||
|
||||
#if CUDA_VERSION >= 11010 && \
|
||||
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
|
||||
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
|
||||
index f9451ecfe6..2ce1efb98c 100644
|
||||
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
|
||||
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
|
||||
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
|
||||
|
||||
struct OpenACC_Traits {
|
||||
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_AMPERE)
|
||||
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
|
||||
static constexpr acc_device_t dev_type = acc_device_nvidia;
|
||||
static constexpr bool may_fallback_to_host = false;
|
||||
#else
|
||||
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
|
||||
index a9bc085912..27ee1d4232 100644
|
||||
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
|
||||
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
|
||||
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
|
||||
|
||||
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
|
||||
// from Pascal and upwards.
|
||||
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
|
||||
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
|
||||
+ defined(KOKKOS_ARCH_HOPPER)
|
||||
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
|
||||
omp_set_num_teams(512);
|
||||
#endif
|
||||
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
|
||||
index 840db4327c..7e5addbc5b 100644
|
||||
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
|
||||
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
|
||||
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
|
||||
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
|
||||
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
|
||||
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
|
||||
- !defined(KOKKOS_ARCH_AMPERE)
|
||||
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
|
||||
if (!settings.has_device_id() && gpu_devices.empty()) {
|
||||
Impl::SYCLInternal::singleton().initialize(sycl::device());
|
||||
return;
|
||||
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
|
||||
index 5ac7d8af30..ba101f699e 100644
|
||||
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
|
||||
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
|
||||
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
|
||||
return std::min({
|
||||
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
|
||||
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
|
||||
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
|
||||
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
|
||||
+ defined(KOKKOS_ARCH_HOPPER)
|
||||
256,
|
||||
#endif
|
||||
max_threads_for_memory
|
||||
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
|
||||
return std::min<int>({
|
||||
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
|
||||
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
|
||||
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
|
||||
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
|
||||
+ defined(KOKKOS_ARCH_HOPPER)
|
||||
256,
|
||||
#endif
|
||||
max_threads_for_memory
|
||||
@ -1,523 +0,0 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index fcd4773dbc..30b6958a67 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
|
||||
// NOTE these number can be obtained several ways:
|
||||
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
|
||||
// "Compute Capability" first and check what is the smallest "Shared Memory
|
||||
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
|
||||
return 0;
|
||||
}() * 1024;
|
||||
}
|
||||
+
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
index 5811498e01..e22eb3b842 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
|
||||
}
|
||||
#endif
|
||||
|
||||
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
|
||||
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
|
||||
-#else
|
||||
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
|
||||
-#endif
|
||||
-
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
index b7a80ad84f..5c4c3a7d39 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
@@ -93,10 +93,6 @@ namespace Impl {
|
||||
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
|
||||
// function qualifier which could be used to improve performance.
|
||||
//----------------------------------------------------------------------------
|
||||
-// Maximize L1 cache and minimize shared memory:
|
||||
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
|
||||
-// For 2.0 capability: 48 KB L1 and 16 KB shared
|
||||
-//----------------------------------------------------------------------------
|
||||
|
||||
template <class DriverType>
|
||||
__global__ static void cuda_parallel_launch_constant_memory() {
|
||||
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
|
||||
}
|
||||
}
|
||||
|
||||
-// This function needs to be template on DriverType and LaunchBounds
|
||||
+// These functions needs to be template on DriverType and LaunchBounds
|
||||
// so that the static bool is unique for each type combo
|
||||
// KernelFuncPtr does not necessarily contain that type information.
|
||||
+
|
||||
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
|
||||
-inline void configure_shmem_preference(KernelFuncPtr const& func,
|
||||
- bool prefer_shmem) {
|
||||
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
|
||||
+ const KernelFuncPtr& func) {
|
||||
+ // Only call cudaFuncGetAttributes once for each unique kernel
|
||||
+ // by leveraging static variable initialization rules
|
||||
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
|
||||
+ cudaFuncAttributes attr;
|
||||
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
|
||||
+ return attr;
|
||||
+ };
|
||||
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
|
||||
+ return func_attr;
|
||||
+}
|
||||
+
|
||||
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
|
||||
+inline void configure_shmem_preference(const KernelFuncPtr& func,
|
||||
+ const cudaDeviceProp& device_props,
|
||||
+ const size_t block_size, int& shmem,
|
||||
+ const size_t occupancy) {
|
||||
#ifndef KOKKOS_ARCH_KEPLER
|
||||
- // On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
+
|
||||
+ const auto& func_attr =
|
||||
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
|
||||
+
|
||||
+ // Compute limits for number of blocks due to registers/SM
|
||||
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
|
||||
+ const size_t regs_per_thread = func_attr.numRegs;
|
||||
+ // The granularity of register allocation is chunks of 256 registers per warp
|
||||
+ // -> 8 registers per thread
|
||||
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||
+ const size_t max_blocks_regs =
|
||||
+ regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
+
|
||||
+ // Compute how many threads per sm we actually want
|
||||
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
|
||||
+ // only allocate multiples of warp size
|
||||
+ const size_t num_threads_desired =
|
||||
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
|
||||
+ // Get close to the desired occupancy,
|
||||
+ // don't undershoot by much but also don't allocate a whole new block just
|
||||
+ // because one is a few threads over otherwise.
|
||||
+ size_t num_blocks_desired =
|
||||
+ (num_threads_desired + block_size * 0.8) / block_size;
|
||||
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
|
||||
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
|
||||
+
|
||||
+ // Calculate how much shared memory we need per block
|
||||
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
|
||||
+
|
||||
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
|
||||
+ // If we want to lower occupancy we have to make sure we request at least that
|
||||
+ // much in aggregate over all blocks, so that shared memory actually becomes a
|
||||
+ // limiting factor for occupancy
|
||||
+ constexpr size_t min_shmem_size_per_sm = 8192;
|
||||
+ if ((occupancy < 100) &&
|
||||
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
|
||||
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
|
||||
+ // Need to set the caller's shmem variable so that the
|
||||
+ // kernel launch uses the correct dynamic shared memory request
|
||||
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
|
||||
+ }
|
||||
+
|
||||
+ // Compute the carveout fraction we need based on occupancy
|
||||
+ // Use multiples of 8kB
|
||||
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
|
||||
+ size_t carveout = shmem_per_block == 0
|
||||
+ ? 0
|
||||
+ : 100 *
|
||||
+ (((num_blocks_desired * shmem_per_block +
|
||||
+ min_shmem_size_per_sm - 1) /
|
||||
+ min_shmem_size_per_sm) *
|
||||
+ min_shmem_size_per_sm) /
|
||||
+ max_shmem_per_sm;
|
||||
+ if (carveout > 100) carveout = 100;
|
||||
+
|
||||
+ // Set the carveout, but only call it once per kernel or when it changes
|
||||
auto set_cache_config = [&] {
|
||||
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
- func,
|
||||
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
|
||||
- return prefer_shmem;
|
||||
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
|
||||
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
|
||||
+ return carveout;
|
||||
};
|
||||
- static bool cache_config_preference_cached = set_cache_config();
|
||||
- if (cache_config_preference_cached != prefer_shmem) {
|
||||
+ // Store the value in a static variable so we only reset if needed
|
||||
+ static size_t cache_config_preference_cached = set_cache_config();
|
||||
+ if (cache_config_preference_cached != carveout) {
|
||||
cache_config_preference_cached = set_cache_config();
|
||||
}
|
||||
#else
|
||||
// Use the parameters so we don't get a warning
|
||||
(void)func;
|
||||
- (void)prefer_shmem;
|
||||
+ (void)device_props;
|
||||
+ (void)block_size;
|
||||
+ (void)occupancy;
|
||||
#endif
|
||||
}
|
||||
|
||||
-template <class Policy>
|
||||
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
|
||||
-modify_launch_configuration_if_desired_occupancy_is_specified(
|
||||
- Policy const& policy, cudaDeviceProp const& properties,
|
||||
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
|
||||
- bool& prefer_shmem) {
|
||||
- int const block_size = block.x * block.y * block.z;
|
||||
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
|
||||
-
|
||||
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
|
||||
- size_t const static_shmem = attributes.sharedSizeBytes;
|
||||
-
|
||||
- // round to nearest integer and avoid division by zero
|
||||
- int active_blocks = std::max(
|
||||
- 1, static_cast<int>(std::round(
|
||||
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
|
||||
- block_size * desired_occupancy / 100)));
|
||||
- int const dynamic_shmem =
|
||||
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
|
||||
-
|
||||
- if (dynamic_shmem > shmem) {
|
||||
- shmem = dynamic_shmem;
|
||||
- prefer_shmem = false;
|
||||
- }
|
||||
-}
|
||||
-
|
||||
-template <class Policy>
|
||||
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
|
||||
-modify_launch_configuration_if_desired_occupancy_is_specified(
|
||||
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
|
||||
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
|
||||
-
|
||||
// </editor-fold> end Some helper functions for launch code readability }}}1
|
||||
//==============================================================================
|
||||
|
||||
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
|
||||
inline static void create_parallel_launch_graph_node(
|
||||
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
|
||||
- CudaInternal const* cuda_instance, bool prefer_shmem) {
|
||||
+ CudaInternal const* cuda_instance) {
|
||||
//----------------------------------------
|
||||
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
|
||||
KOKKOS_EXPECTS(bool(graph));
|
||||
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
- base_t::get_kernel_func(), prefer_shmem);
|
||||
+ if (DriverType::Policy::
|
||||
+ experimental_contains_desired_occupancy) {
|
||||
+ int desired_occupancy =
|
||||
+ driver.get_policy().impl_get_desired_occupancy().value();
|
||||
+ size_t block_size = block.x * block.y * block.z;
|
||||
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
+ shmem, desired_occupancy);
|
||||
+ }
|
||||
|
||||
void const* args[] = {&driver};
|
||||
|
||||
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
|
||||
inline static void create_parallel_launch_graph_node(
|
||||
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
|
||||
- CudaInternal const* cuda_instance, bool prefer_shmem) {
|
||||
+ CudaInternal const* cuda_instance) {
|
||||
//----------------------------------------
|
||||
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
|
||||
KOKKOS_EXPECTS(bool(graph));
|
||||
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
- base_t::get_kernel_func(), prefer_shmem);
|
||||
+ if constexpr (DriverType::Policy::
|
||||
+ experimental_contains_desired_occupancy) {
|
||||
+ int desired_occupancy =
|
||||
+ driver.get_policy().impl_get_desired_occupancy().value();
|
||||
+ size_t block_size = block.x * block.y * block.z;
|
||||
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
+ shmem, desired_occupancy);
|
||||
+ }
|
||||
|
||||
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
|
||||
|
||||
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
|
||||
inline static void create_parallel_launch_graph_node(
|
||||
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
|
||||
- CudaInternal const* cuda_instance, bool prefer_shmem) {
|
||||
+ CudaInternal const* cuda_instance) {
|
||||
// Just use global memory; coordinating through events to share constant
|
||||
// memory with the non-graph interface is not really reasonable since
|
||||
// events don't work with Graphs directly, and this would anyway require
|
||||
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
DriverType, LaunchBounds,
|
||||
Experimental::CudaLaunchMechanism::GlobalMemory>;
|
||||
global_launch_impl_t::create_parallel_launch_graph_node(
|
||||
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
|
||||
+ driver, grid, block, shmem, cuda_instance);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
|
||||
|
||||
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
|
||||
const dim3& block, int shmem,
|
||||
- const CudaInternal* cuda_instance,
|
||||
- bool prefer_shmem) {
|
||||
+ const CudaInternal* cuda_instance) {
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
// Prevent multiple threads to simultaneously set the cache configuration
|
||||
// preference and launch the same kernel
|
||||
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
|
||||
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
|
||||
- // If a desired occupancy is specified, we compute how much shared memory
|
||||
- // to ask for to achieve that occupancy, assuming that the cache
|
||||
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
|
||||
- // shared memory computed is actually smaller than `shmem` we overwrite
|
||||
- // `shmem` and set `prefer_shmem` to `false`.
|
||||
- modify_launch_configuration_if_desired_occupancy_is_specified(
|
||||
- driver.get_policy(), cuda_instance->m_deviceProp,
|
||||
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
|
||||
-
|
||||
- Impl::configure_shmem_preference<
|
||||
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
- base_t::get_kernel_func(), prefer_shmem);
|
||||
+ if (DriverType::Policy::
|
||||
+ experimental_contains_desired_occupancy) {
|
||||
+ int desired_occupancy =
|
||||
+ driver.get_policy().impl_get_desired_occupancy().value();
|
||||
+ size_t block_size = block.x * block.y * block.z;
|
||||
+ Impl::configure_shmem_preference<
|
||||
+ DriverType,
|
||||
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
+ shmem, desired_occupancy);
|
||||
+ }
|
||||
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
|
||||
}
|
||||
|
||||
static cudaFuncAttributes get_cuda_func_attributes() {
|
||||
- // Race condition inside of cudaFuncGetAttributes if the same address is
|
||||
- // given requires using a local variable as input instead of a static Rely
|
||||
- // on static variable initialization to make sure only one thread executes
|
||||
- // the code and the result is visible.
|
||||
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
|
||||
- cudaFuncAttributes attr_tmp;
|
||||
- KOKKOS_IMPL_CUDA_SAFE_CALL(
|
||||
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
|
||||
- return attr_tmp;
|
||||
- };
|
||||
- static cudaFuncAttributes attr = wrap_get_attributes();
|
||||
- return attr;
|
||||
+ return get_cuda_kernel_func_attributes<
|
||||
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
+ base_t::get_kernel_func());
|
||||
}
|
||||
};
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
index e586bb4cc6..0e348c092a 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
maxblocks[1]),
|
||||
1);
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 3) {
|
||||
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
|
||||
KOKKOS_ASSERT(block.x > 0);
|
||||
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 4) {
|
||||
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
|
||||
// threadIdx.z
|
||||
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 5) {
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
|
||||
// threadIdx.z
|
||||
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 6) {
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
|
||||
// threadIdx.z
|
||||
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else {
|
||||
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
|
||||
}
|
||||
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
||||
|
||||
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
if (!m_result_ptr_device_accessible) {
|
||||
if (m_result_ptr) {
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
|
||||
index ac160f8fe2..d1031751c2 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
|
||||
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
#endif
|
||||
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
|
||||
}
|
||||
|
||||
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
|
||||
|
||||
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
if (!m_result_ptr_device_accessible) {
|
||||
if (m_result_ptr) {
|
||||
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
m_final = false;
|
||||
CudaParallelLaunch<ParallelScan, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
}
|
||||
#endif
|
||||
m_final = true;
|
||||
CudaParallelLaunch<ParallelScan, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
|
||||
m_final = false;
|
||||
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
}
|
||||
#endif
|
||||
m_final = true;
|
||||
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
const int size = Analysis::value_size(m_functor);
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
|
||||
index cdd16085b3..ea9430b812 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
|
||||
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
*this, grid, block, shmem_size_total,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- true); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
}
|
||||
|
||||
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
|
||||
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
|
||||
*this, grid, block, shmem_size_total,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- true); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
if (!m_result_ptr_device_accessible) {
|
||||
m_policy.space().fence(
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
|
||||
index 34d4bef9fd..178012431c 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
|
||||
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
|
||||
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
|
||||
// function qualifier which could be used to improve performance.
|
||||
//----------------------------------------------------------------------------
|
||||
-// Maximize shared memory and minimize L1 cache:
|
||||
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
|
||||
-// For 2.0 capability: 48 KB shared and 16 KB L1
|
||||
-//----------------------------------------------------------------------------
|
||||
-//----------------------------------------------------------------------------
|
||||
/*
|
||||
* Algorithmic constraints:
|
||||
* (a) blockDim.y <= 1024
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
|
||||
index fb3a6b138f..a12378a891 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
|
||||
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
|
||||
const int shared = 0;
|
||||
|
||||
Kokkos::Impl::CudaParallelLaunch<Self>(
|
||||
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
|
||||
}
|
||||
|
||||
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
@ -1,46 +0,0 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index 30b6958a67..b94f053272 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
-// NOTE these number can be obtained several ways:
|
||||
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
|
||||
-// "Compute Capability" first and check what is the smallest "Shared Memory
|
||||
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
|
||||
-// bytes is then to be found below in the summary.
|
||||
-// * Another option would be to look for the information in the "Tuning
|
||||
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
|
||||
-// the "Shared Memory" section (more tedious)
|
||||
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
|
||||
- int const compute_capability = properties.major * 10 + properties.minor;
|
||||
- return [compute_capability]() {
|
||||
- switch (compute_capability) {
|
||||
- case 30:
|
||||
- case 32:
|
||||
- case 35: return 16;
|
||||
- case 37: return 80;
|
||||
- case 50:
|
||||
- case 53:
|
||||
- case 60:
|
||||
- case 62: return 64;
|
||||
- case 52:
|
||||
- case 61: return 96;
|
||||
- case 70:
|
||||
- case 80:
|
||||
- case 86:
|
||||
- case 90: return 8;
|
||||
- case 75: return 32;
|
||||
- default:
|
||||
- Kokkos::Impl::throw_runtime_exception(
|
||||
- "Unknown device in cuda block size deduction");
|
||||
- }
|
||||
- return 0;
|
||||
- }() * 1024;
|
||||
-}
|
||||
-
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
@ -1,204 +0,0 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index b94f053272..252c13c524 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -53,17 +53,69 @@
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
+inline int cuda_warp_per_sm_allocation_granularity(
|
||||
+ cudaDeviceProp const& properties) {
|
||||
+ // Allocation granularity of warps in each sm
|
||||
+ switch (properties.major) {
|
||||
+ case 3:
|
||||
+ case 5:
|
||||
+ case 7:
|
||||
+ case 8:
|
||||
+ case 9: return 4;
|
||||
+ case 6: return (properties.minor == 0 ? 2 : 4);
|
||||
+ default:
|
||||
+ throw_runtime_exception(
|
||||
+ "Unknown device in cuda warp per sm allocation granularity");
|
||||
+ return 0;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+inline int cuda_max_warps_per_sm_registers(
|
||||
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
|
||||
+ // Maximum number of warps per sm as a function of register counts,
|
||||
+ // subject to the constraint that warps are allocated with a fixed granularity
|
||||
+ int const max_regs_per_block = properties.regsPerBlock;
|
||||
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
|
||||
+ int const warp_granularity =
|
||||
+ cuda_warp_per_sm_allocation_granularity(properties);
|
||||
+ // The granularity of register allocation is chunks of 256 registers per warp,
|
||||
+ // which implies a need to over-allocate, so we round up
|
||||
+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
|
||||
+
|
||||
+ // The maximum number of warps per SM is constrained from above by register
|
||||
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
|
||||
+ // finite granularity, we need to round down.
|
||||
+ int const max_warps_per_sm =
|
||||
+ warp_granularity *
|
||||
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
|
||||
+
|
||||
+ return max_warps_per_sm;
|
||||
+}
|
||||
+
|
||||
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
|
||||
cudaFuncAttributes const& attributes,
|
||||
int block_size, size_t dynamic_shmem) {
|
||||
- // Limits due do registers/SM
|
||||
+ // Limits due to registers/SM
|
||||
int const regs_per_sm = properties.regsPerMultiprocessor;
|
||||
int const regs_per_thread = attributes.numRegs;
|
||||
// The granularity of register allocation is chunks of 256 registers per warp
|
||||
// -> 8 registers per thread
|
||||
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||
- int const max_blocks_regs =
|
||||
- regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
+
|
||||
+ // Compute the maximum number of warps as a function of the number of
|
||||
+ // registers
|
||||
+ int const max_warps_per_sm_registers =
|
||||
+ cuda_max_warps_per_sm_registers(properties, attributes);
|
||||
+
|
||||
+ // Constrain the number of blocks to respect the maximum number of warps per
|
||||
+ // SM On face value this should be an equality, but due to the warp
|
||||
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
|
||||
+ // left-hand-side of this comparison can overshoot what the hardware allows
|
||||
+ // based on register counts alone
|
||||
+ while ((max_blocks_regs * block_size / properties.warpSize) >
|
||||
+ max_warps_per_sm_registers)
|
||||
+ max_blocks_regs--;
|
||||
|
||||
// Limits due to shared memory/SM
|
||||
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
|
||||
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
+template <class LaunchBounds>
|
||||
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
|
||||
+ LaunchBounds) {
|
||||
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
|
||||
+
|
||||
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
|
||||
+ // memory
|
||||
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
|
||||
+
|
||||
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
|
||||
+ LaunchBounds{});
|
||||
+}
|
||||
+
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
index 5c4c3a7d39..170183ca0a 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
|
||||
// The granularity of register allocation is chunks of 256 registers per warp
|
||||
// -> 8 registers per thread
|
||||
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||
- const size_t max_blocks_regs =
|
||||
+ size_t max_blocks_regs =
|
||||
regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
|
||||
+ // Compute the maximum number of warps as a function of the number of
|
||||
+ // registers
|
||||
+ const size_t max_warps_per_sm_registers =
|
||||
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
|
||||
+
|
||||
+ // Constrain the number of blocks to respect the maximum number of warps per
|
||||
+ // SM On face value this should be an equality, but due to the warp
|
||||
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
|
||||
+ // left-hand-side of this comparison can overshoot what the hardware allows
|
||||
+ // based on register counts alone
|
||||
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
|
||||
+ max_warps_per_sm_registers)
|
||||
+ max_blocks_regs--;
|
||||
+
|
||||
// Compute how many threads per sm we actually want
|
||||
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
|
||||
// only allocate multiples of warp size
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
index 0e348c092a..7e4f62f12e 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
@@ -67,6 +67,34 @@
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
+template <typename ParallelType, typename Policy, typename LaunchBounds>
|
||||
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
|
||||
+ cudaFuncAttributes attr =
|
||||
+ CudaParallelLaunch<ParallelType,
|
||||
+ LaunchBounds>::get_cuda_func_attributes();
|
||||
+ auto const& prop = pol.space().cuda_device_prop();
|
||||
+
|
||||
+ // Limits due to registers/SM, MDRange doesn't have
|
||||
+ // shared memory constraints
|
||||
+ int const optimal_block_size =
|
||||
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
|
||||
+
|
||||
+ // Compute how many blocks of this size we can launch, based on warp
|
||||
+ // constraints
|
||||
+ int const max_warps_per_sm_registers =
|
||||
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
|
||||
+ int const max_num_threads_from_warps =
|
||||
+ max_warps_per_sm_registers * prop.warpSize;
|
||||
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
|
||||
+
|
||||
+ // Compute the total number of threads
|
||||
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
|
||||
+
|
||||
+ return std::min(
|
||||
+ max_threads_per_sm,
|
||||
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
|
||||
+}
|
||||
+
|
||||
template <class FunctorType, class... Traits>
|
||||
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
public:
|
||||
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
public:
|
||||
template <typename Policy, typename Functor>
|
||||
static int max_tile_size_product(const Policy& pol, const Functor&) {
|
||||
- cudaFuncAttributes attr =
|
||||
- CudaParallelLaunch<ParallelFor,
|
||||
- LaunchBounds>::get_cuda_func_attributes();
|
||||
- auto const& prop = pol.space().cuda_device_prop();
|
||||
- // Limits due to registers/SM, MDRange doesn't have
|
||||
- // shared memory constraints
|
||||
- int const regs_per_sm = prop.regsPerMultiprocessor;
|
||||
- int const regs_per_thread = attr.numRegs;
|
||||
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
|
||||
- return std::min(
|
||||
- max_threads_per_sm,
|
||||
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
|
||||
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
|
||||
}
|
||||
Policy const& get_policy() const { return m_rp; }
|
||||
inline __device__ void operator()() const {
|
||||
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
||||
public:
|
||||
template <typename Policy, typename Functor>
|
||||
static int max_tile_size_product(const Policy& pol, const Functor&) {
|
||||
- cudaFuncAttributes attr =
|
||||
- CudaParallelLaunch<ParallelReduce,
|
||||
- LaunchBounds>::get_cuda_func_attributes();
|
||||
- auto const& prop = pol.space().cuda_device_prop();
|
||||
- // Limits due do registers/SM
|
||||
- int const regs_per_sm = prop.regsPerMultiprocessor;
|
||||
- int const regs_per_thread = attr.numRegs;
|
||||
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
|
||||
- return std::min(
|
||||
- max_threads_per_sm,
|
||||
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
|
||||
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
|
||||
}
|
||||
Policy const& get_policy() const { return m_policy; }
|
||||
inline __device__ void exec_range(reference_type update) const {
|
||||
@ -1,63 +0,0 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
index 170183ca0a..ba43e362bb 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
if (DriverType::Policy::
|
||||
experimental_contains_desired_occupancy) {
|
||||
+ /*
|
||||
int desired_occupancy =
|
||||
driver.get_policy().impl_get_desired_occupancy().value();
|
||||
size_t block_size = block.x * block.y * block.z;
|
||||
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
- shmem, desired_occupancy);
|
||||
+ shmem, desired_occupancy);*/
|
||||
+ Kokkos::Impl::throw_runtime_exception(
|
||||
+ std::string("Cuda graph node creation FAILED:"
|
||||
+ " occupancy requests are currently broken."));
|
||||
}
|
||||
|
||||
void const* args[] = {&driver};
|
||||
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
- if constexpr (DriverType::Policy::
|
||||
+ if (DriverType::Policy::
|
||||
experimental_contains_desired_occupancy) {
|
||||
- int desired_occupancy =
|
||||
+ /*int desired_occupancy =
|
||||
driver.get_policy().impl_get_desired_occupancy().value();
|
||||
size_t block_size = block.x * block.y * block.z;
|
||||
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
- shmem, desired_occupancy);
|
||||
+ shmem, desired_occupancy);*/
|
||||
+ Kokkos::Impl::throw_runtime_exception(
|
||||
+ std::string("Cuda graph node creation FAILED:"
|
||||
+ " occupancy requests are currently broken."));
|
||||
}
|
||||
|
||||
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
|
||||
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
|
||||
|
||||
if (DriverType::Policy::
|
||||
experimental_contains_desired_occupancy) {
|
||||
- int desired_occupancy =
|
||||
+ /*int desired_occupancy =
|
||||
driver.get_policy().impl_get_desired_occupancy().value();
|
||||
size_t block_size = block.x * block.y * block.z;
|
||||
Impl::configure_shmem_preference<
|
||||
DriverType,
|
||||
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
- shmem, desired_occupancy);
|
||||
+ shmem, desired_occupancy);*/
|
||||
+ Kokkos::Impl::throw_runtime_exception(
|
||||
+ std::string("Cuda graph node creation FAILED:"
|
||||
+ " occupancy requests are currently broken."));
|
||||
}
|
||||
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
30
potentials/HGa.msmeam
Normal file
30
potentials/HGa.msmeam
Normal file
@ -0,0 +1,30 @@
|
||||
bkgd_dyn = 1
|
||||
emb_lin_neg = 1
|
||||
augt1=0
|
||||
ialloy=1
|
||||
rc = 5.9
|
||||
#H
|
||||
attrac(1,1)=0.460
|
||||
repuls(1,1)=0.460
|
||||
Cmin(1,1,1)=1.3 # PuMS
|
||||
Cmax(1,1,1)= 2.80
|
||||
nn2(1,1)=1
|
||||
#Ga
|
||||
rho0(2) = 0.6
|
||||
attrac(2,2)=0.097
|
||||
repuls(2,2)=0.097
|
||||
nn2(2,2)=1
|
||||
#HGa
|
||||
attrac(1,2)=0.300
|
||||
repuls(1,2)=0.300
|
||||
lattce(1,2)=l12
|
||||
re(1,2)=3.19
|
||||
delta(1,2)=-0.48
|
||||
alpha(1,2)=6.6
|
||||
Cmin(1,1,2)=2.0
|
||||
Cmin(2,1,2)= 2.0
|
||||
Cmin(1,2,1)=2.0
|
||||
Cmin(2,2,1) = 1.4
|
||||
Cmin(1,2,2) = 1.4
|
||||
Cmin(1,1,2) = 1.4
|
||||
nn2(1,2)=1
|
||||
14
potentials/library.msmeam
Normal file
14
potentials/library.msmeam
Normal file
@ -0,0 +1,14 @@
|
||||
# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010
|
||||
# ms-meam data format May 2010
|
||||
# elt lat z ielement atwt
|
||||
# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub
|
||||
# - t0 t1 t2 t3 t1m t2m t3m rozero ibar
|
||||
# NOTE: leading character cannot be a space
|
||||
|
||||
'H' 'dim' 1.0 1 1.0079
|
||||
2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50
|
||||
1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0
|
||||
|
||||
'Ga4' 'fcc' 12.0 31 69.723
|
||||
4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97
|
||||
1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0
|
||||
@ -32,7 +32,7 @@ if not pylib.Py_IsInitialized():
|
||||
else:
|
||||
from .loader import load_model, load_unified, activate_mliappy
|
||||
try:
|
||||
from .loader import load_model_kokkos, activate_mliappy_kokkos
|
||||
from .loader import load_model_kokkos, load_unified_kokkos, activate_mliappy_kokkos
|
||||
except Exception as ee:
|
||||
# ignore import error, it means that the KOKKOS package was not included in LAMMPS
|
||||
pass
|
||||
|
||||
@ -75,7 +75,7 @@ def activate_mliappy(lmp):
|
||||
def activate_mliappy_kokkos(lmp):
|
||||
try:
|
||||
library = lmp.lib
|
||||
module_names = ["mliap_model_python_couple_kokkos"]
|
||||
module_names = ["mliap_model_python_couple_kokkos", "mliap_unified_couple_kokkos"]
|
||||
api_version = library.lammps_python_api_version()
|
||||
|
||||
for module_name in module_names:
|
||||
@ -118,3 +118,12 @@ def load_unified(model):
|
||||
) from ie
|
||||
mliap_unified_couple.load_from_python(model)
|
||||
|
||||
def load_unified_kokkos(model):
|
||||
try:
|
||||
import mliap_unified_couple_kokkos
|
||||
except ImportError as ie:
|
||||
raise ImportError("ML-IAP python module must be activated before loading\n"
|
||||
"the pair style. Call lammps.mliap.activate_mliappy(lmp)."
|
||||
) from ie
|
||||
mliap_unified_couple_kokkos.load_from_python(model)
|
||||
|
||||
|
||||
@ -22,6 +22,7 @@
|
||||
#include "memory.h"
|
||||
#include "neighbor.h"
|
||||
#include "remap_wrap.h"
|
||||
#include "timer.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
@ -326,15 +327,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
|
||||
cfft[n++] = ZEROF;
|
||||
}
|
||||
|
||||
double time0,time1;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// perform forward FFT
|
||||
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
time1 = platform::walltime();
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
FFT_SCALAR scale = 1.0/nfft_global;
|
||||
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
|
||||
}
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"PRE Convo / POST FFT");
|
||||
debug_file(CFFT1,"pre.convo.post.fft");
|
||||
@ -382,15 +391,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
|
||||
debug_scalar(FFT,"PRE Convo / POST Remap");
|
||||
debug_file(FFT,"pre.convo.post.remap");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// perform forward FFT
|
||||
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
time1 = platform::walltime();
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
FFT_SCALAR scale = 1.0/nfft_global;
|
||||
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
|
||||
}
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"PRE Convo / POST FFT");
|
||||
debug_file(CFFT1,"pre.convo.post.fft");
|
||||
@ -423,7 +441,16 @@ void *AmoebaConvolution::post_convolution_3d()
|
||||
debug_scalar(CFFT1,"POST Convo / PRE FFT");
|
||||
debug_file(CFFT1,"post.convo.pre.fft");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
|
||||
time1 = platform::walltime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT2,"POST Convo / POST FFT");
|
||||
@ -465,8 +492,18 @@ void *AmoebaConvolution::post_convolution_4d()
|
||||
debug_scalar(CFFT1,"POST Convo / PRE FFT");
|
||||
debug_file(CFFT1,"post.convo.pre.fft");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
|
||||
|
||||
time1 = platform::walltime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT2,"POST Convo / POST FFT");
|
||||
debug_file(CFFT2,"post.convo.post.fft");
|
||||
|
||||
@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers {
|
||||
int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out;
|
||||
int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft;
|
||||
bigint nfft_global; // nx * ny * nz
|
||||
double *grid_brick_start; // lower left corner of (c)grid_brick data
|
||||
FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data
|
||||
|
||||
AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int);
|
||||
~AmoebaConvolution();
|
||||
@ -47,7 +47,9 @@ class AmoebaConvolution : protected Pointers {
|
||||
FFT_SCALAR *pre_convolution();
|
||||
void *post_convolution();
|
||||
|
||||
private:
|
||||
double time_fft;
|
||||
|
||||
protected:
|
||||
int which; // caller name for convolution being performed
|
||||
int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick
|
||||
int nbrick_owned; // owned grid points in brick decomp
|
||||
@ -59,23 +61,23 @@ class AmoebaConvolution : protected Pointers {
|
||||
class Grid3d *gc;
|
||||
class Remap *remap;
|
||||
|
||||
double ***grid_brick; // 3d real brick grid with ghosts
|
||||
double ****cgrid_brick; // 4d complex brick grid with ghosts
|
||||
FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts
|
||||
FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts
|
||||
|
||||
FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector
|
||||
FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector
|
||||
|
||||
double *gc_buf1, *gc_buf2; // buffers for GridComm
|
||||
double *remap_buf; // buffer for Remap
|
||||
FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm
|
||||
FFT_SCALAR *remap_buf; // buffer for Remap
|
||||
|
||||
void allocate_grid();
|
||||
void deallocate_grid();
|
||||
void *zero_3d();
|
||||
void *zero_4d();
|
||||
FFT_SCALAR *pre_convolution_3d();
|
||||
FFT_SCALAR *pre_convolution_4d();
|
||||
virtual FFT_SCALAR *pre_convolution_4d();
|
||||
void *post_convolution_3d();
|
||||
void *post_convolution_4d();
|
||||
virtual void *post_convolution_4d();
|
||||
void procs2grid2d(int, int, int, int &, int &);
|
||||
|
||||
// DEBUG
|
||||
|
||||
@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace()
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by zero()
|
||||
|
||||
double ***gridpre = (double ***) d_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -294,7 +294,7 @@ void PairAmoeba::dispersion_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomposition
|
||||
|
||||
double *gridfft = d_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = d_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
|
||||
@ -24,6 +24,7 @@
|
||||
#include "math_special.h"
|
||||
#include "my_page.h"
|
||||
#include "neigh_list.h"
|
||||
#include "timer.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
@ -381,8 +382,6 @@ void PairAmoeba::induce()
|
||||
}
|
||||
}
|
||||
|
||||
// if (comm->me == 0) printf("CG iteration count = %d\n",iter);
|
||||
|
||||
// terminate the calculation if dipoles failed to converge
|
||||
// NOTE: could make this an error
|
||||
|
||||
@ -546,13 +545,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp)
|
||||
}
|
||||
}
|
||||
|
||||
// get the reciprocal space part of the mutual field
|
||||
|
||||
if (polar_kspace_flag) umutual1(field,fieldp);
|
||||
double time0, time1, time2;
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// get the real space portion of the mutual field
|
||||
|
||||
if (polar_rspace_flag) umutual2b(field,fieldp);
|
||||
time1 = platform::walltime();
|
||||
|
||||
// get the reciprocal space part of the mutual field
|
||||
|
||||
if (polar_kspace_flag) umutual1(field,fieldp);
|
||||
time2 = platform::walltime();
|
||||
|
||||
// add the self-energy portion of the mutual field
|
||||
|
||||
@ -563,6 +568,11 @@ void PairAmoeba::ufield0c(double **field, double **fieldp)
|
||||
fieldp[i][j] += term*uinp[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// accumulate timing information
|
||||
|
||||
time_mutual_rspace += time1 - time0;
|
||||
time_mutual_kspace += time2 - time1;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -785,7 +795,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
|
||||
|
||||
// get the reciprocal space part of the permanent field
|
||||
|
||||
double time0, time1, time2;
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
if (polar_kspace_flag) udirect1(field);
|
||||
time1 = platform::walltime();
|
||||
|
||||
for (i = 0; i < nlocal; i++) {
|
||||
for (j = 0; j < 3; j++) {
|
||||
@ -796,6 +811,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
|
||||
// get the real space portion of the permanent field
|
||||
|
||||
if (polar_rspace_flag) udirect2b(field,fieldp);
|
||||
time2 = platform::walltime();
|
||||
|
||||
// get the self-energy portion of the permanent field
|
||||
|
||||
@ -806,6 +822,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
|
||||
fieldp[i][j] += term*rpole[i][j+1];
|
||||
}
|
||||
}
|
||||
|
||||
// accumulate timing information
|
||||
|
||||
time_direct_kspace += time1 - time0;
|
||||
time_direct_rspace += time2 - time1;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -842,18 +863,26 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
|
||||
}
|
||||
}
|
||||
|
||||
double time0, time1;
|
||||
|
||||
// gridpre = my portion of 4d grid in brick decomp w/ ghost values
|
||||
|
||||
double ****gridpre = (double ****) ic_kspace->zero();
|
||||
FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero();
|
||||
|
||||
// map 2 values to grid
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
grid_uind(fuind,fuinp,gridpre);
|
||||
|
||||
time1 = platform::walltime();
|
||||
time_grid_uind += (time1 - time0);
|
||||
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomposition
|
||||
|
||||
double *gridfft = ic_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = ic_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -883,12 +912,18 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
|
||||
|
||||
double ****gridpost = (double ****) ic_kspace->post_convolution();
|
||||
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
|
||||
|
||||
time1 = platform::walltime();
|
||||
time_fphi_uind += (time1 - time0);
|
||||
|
||||
// store fractional reciprocal potentials for OPT method
|
||||
|
||||
if (poltyp == OPT) {
|
||||
@ -1055,7 +1090,7 @@ void PairAmoeba::udirect1(double **field)
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by setup()
|
||||
|
||||
double ***gridpre = (double ***) i_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero();
|
||||
|
||||
// map multipole moments to grid
|
||||
|
||||
@ -1064,7 +1099,7 @@ void PairAmoeba::udirect1(double **field)
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my 1d portion of complex 3d grid in FFT decomp
|
||||
|
||||
double *gridfft = i_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = i_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -1109,7 +1144,7 @@ void PairAmoeba::udirect1(double **field)
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpost = (double ***) i_kspace->post_convolution();
|
||||
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
|
||||
@ -68,25 +68,23 @@ void PairAmoeba::moduli()
|
||||
int maxfft = MAX(nfft1,nfft2);
|
||||
maxfft = MAX(maxfft,nfft3);
|
||||
|
||||
double *array = new double[bsorder];
|
||||
double *bsarray = new double[maxfft];
|
||||
if (maxfft > _nfft_max) {
|
||||
memory->destroy(_moduli_bsarray);
|
||||
_nfft_max = maxfft;
|
||||
memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray");
|
||||
}
|
||||
|
||||
// compute and load the moduli values
|
||||
|
||||
double x = 0.0;
|
||||
bspline(x,bsorder,array);
|
||||
bspline(x,bsorder,_moduli_array);
|
||||
|
||||
for (i = 0; i < maxfft; i++) bsarray[i] = 0.0;
|
||||
for (i = 0; i < bsorder; i++) bsarray[i+1] = array[i];
|
||||
for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0;
|
||||
for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i];
|
||||
|
||||
dftmod(bsmod1,bsarray,nfft1,bsorder);
|
||||
dftmod(bsmod2,bsarray,nfft2,bsorder);
|
||||
dftmod(bsmod3,bsarray,nfft3,bsorder);
|
||||
|
||||
// perform deallocation of local arrays
|
||||
|
||||
delete[] array;
|
||||
delete[] bsarray;
|
||||
dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder);
|
||||
dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder);
|
||||
dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -525,7 +523,7 @@ void PairAmoeba::frac_to_cart()
|
||||
grid_mpole maps fractional atomic multipoles to PME grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::grid_mpole(double **fmp, double ***grid)
|
||||
void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb;
|
||||
double v0,u0,t0;
|
||||
@ -598,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid)
|
||||
the particle mesh Ewald grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
|
||||
void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb;
|
||||
double v0,v1,v2,v3;
|
||||
@ -742,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
|
||||
grid_uind maps fractional induced dipoles to the PME grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
|
||||
void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb;
|
||||
double v0,u0,t0;
|
||||
@ -793,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
|
||||
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
|
||||
void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1,
|
||||
double **fdip_phi2, double **fdip_sum_phi)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb;
|
||||
@ -1042,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
|
||||
grid_disp maps dispersion coefficients to PME grid
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void PairAmoeba::grid_disp(double ***grid)
|
||||
void PairAmoeba::grid_disp(FFT_SCALAR ***grid)
|
||||
{
|
||||
int i,j,k,m,ib,jb,kb,itype,iclass;
|
||||
double v0,u0,t0;
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#include "math_const.h"
|
||||
#include "math_special.h"
|
||||
#include "neigh_list.h"
|
||||
#include "timer.h"
|
||||
|
||||
#include <cmath>
|
||||
|
||||
@ -55,6 +56,8 @@ void PairAmoeba::multipole()
|
||||
double qixx,qixy,qixz,qiyy,qiyz,qizz;
|
||||
double cii,dii,qii;
|
||||
|
||||
double time0,time1,time2;
|
||||
|
||||
// set cutoffs, taper coeffs, and PME params
|
||||
|
||||
if (use_ewald) choose(MPOLE_LONG);
|
||||
@ -78,13 +81,18 @@ void PairAmoeba::multipole()
|
||||
|
||||
felec = electric / am_dielectric;
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// compute the real space part of the Ewald summation
|
||||
|
||||
if (mpole_rspace_flag) multipole_real();
|
||||
time1 = platform::walltime();
|
||||
|
||||
// compute the reciprocal space part of the Ewald summation
|
||||
|
||||
if (mpole_kspace_flag) multipole_kspace();
|
||||
time2 = platform::walltime();
|
||||
|
||||
// compute the Ewald self-energy term over all the atoms
|
||||
|
||||
@ -109,6 +117,11 @@ void PairAmoeba::multipole()
|
||||
e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0));
|
||||
empole += e;
|
||||
}
|
||||
|
||||
// accumulate timing information
|
||||
|
||||
time_mpole_rspace += time1 - time0;
|
||||
time_mpole_kspace += time2 - time1;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -361,6 +374,9 @@ void PairAmoeba::multipole_real()
|
||||
bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2;
|
||||
}
|
||||
for (k = 0; k < 6; k++) bn[k] *= felec;
|
||||
//if (i == 0 && j < 10) {
|
||||
// printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]);
|
||||
//}
|
||||
|
||||
// find damped multipole intermediates and energy value
|
||||
|
||||
@ -404,6 +420,8 @@ void PairAmoeba::multipole_real()
|
||||
term2i*rr3i + term2k*rr3k + term2ik*rr3ik +
|
||||
term3i*rr5i + term3k*rr5k + term3ik*rr5ik;
|
||||
|
||||
|
||||
|
||||
// find damped multipole intermediates for force and torque
|
||||
|
||||
de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik +
|
||||
@ -444,6 +462,7 @@ void PairAmoeba::multipole_real()
|
||||
term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
|
||||
term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9);
|
||||
term6 = 4.0 * rr7;
|
||||
|
||||
}
|
||||
|
||||
empole += e;
|
||||
@ -482,6 +501,7 @@ void PairAmoeba::multipole_real()
|
||||
tq[i][2] += ttmi[2];
|
||||
|
||||
// increment force-based gradient and torque on second site
|
||||
// commenting out j parts for DEBUGGING
|
||||
|
||||
f[j][0] += frcx;
|
||||
f[j][1] += frcy;
|
||||
@ -638,7 +658,7 @@ void PairAmoeba::multipole_kspace()
|
||||
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpre = (double ***) m_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -647,7 +667,7 @@ void PairAmoeba::multipole_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
|
||||
|
||||
double *gridfft = m_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = m_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -718,7 +738,7 @@ void PairAmoeba::multipole_kspace()
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpost = (double ***) m_kspace->post_convolution();
|
||||
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#include "math_const.h"
|
||||
#include "math_special.h"
|
||||
#include "neigh_list.h"
|
||||
#include "timer.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
@ -55,6 +56,8 @@ void PairAmoeba::polar()
|
||||
double fix[3],fiy[3],fiz[3];
|
||||
double tep[3];
|
||||
|
||||
double time0,time1,time2;
|
||||
|
||||
// set cutoffs, taper coeffs, and PME params
|
||||
|
||||
if (use_ewald) choose(POLAR_LONG);
|
||||
@ -76,11 +79,16 @@ void PairAmoeba::polar()
|
||||
|
||||
// compute the real space part of the dipole interactions
|
||||
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
if (polar_rspace_flag) polar_real();
|
||||
time1 = platform::walltime();
|
||||
|
||||
// compute the reciprocal space part of dipole interactions
|
||||
|
||||
if (polar_kspace_flag) polar_kspace();
|
||||
time2 = platform::walltime();
|
||||
|
||||
// compute the Ewald self-energy torque and virial terms
|
||||
|
||||
@ -133,6 +141,11 @@ void PairAmoeba::polar()
|
||||
virpolar[4] -= vxz;
|
||||
virpolar[5] -= vyz;
|
||||
}
|
||||
|
||||
// accumulate timing information
|
||||
|
||||
time_polar_rspace += time1 - time0;
|
||||
time_polar_kspace += time2 - time1;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -382,7 +395,7 @@ void PairAmoeba::polar_real()
|
||||
factor_uscale = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
//if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale);
|
||||
r = sqrt(r2);
|
||||
ck = rpole[j][0];
|
||||
dkx = rpole[j][1];
|
||||
@ -597,7 +610,6 @@ void PairAmoeba::polar_real()
|
||||
dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir;
|
||||
dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir;
|
||||
dufld[i][5] += zr*tiz5 + zr*zr*tuir;
|
||||
|
||||
dufld[j][0] -= xr*tkx5 + xr*xr*tukr;
|
||||
dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr;
|
||||
dufld[j][2] -= yr*tky5 + yr*yr*tukr;
|
||||
@ -855,6 +867,7 @@ void PairAmoeba::polar_real()
|
||||
frcx = -2.0 * depx;
|
||||
frcy = -2.0 * depy;
|
||||
frcz = -2.0 * depz;
|
||||
|
||||
}
|
||||
|
||||
// get the dtau/dr terms used for mutual polarization force
|
||||
@ -1327,7 +1340,7 @@ void PairAmoeba::polar_kspace()
|
||||
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpre = (double ***) p_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1336,7 +1349,7 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
|
||||
|
||||
double *gridfft = p_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -1386,7 +1399,7 @@ void PairAmoeba::polar_kspace()
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
double ***gridpost = (double ***) p_kspace->post_convolution();
|
||||
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
@ -1419,7 +1432,7 @@ void PairAmoeba::polar_kspace()
|
||||
|
||||
// gridpre2 = my portion of 4d grid in brick decomp w/ ghost values
|
||||
|
||||
double ****gridpre2 = (double ****) pc_kspace->zero();
|
||||
FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero();
|
||||
|
||||
// map 2 values to grid
|
||||
|
||||
@ -1428,7 +1441,7 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomposition
|
||||
|
||||
double *gridfft = pc_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = pc_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -1451,7 +1464,7 @@ void PairAmoeba::polar_kspace()
|
||||
// post-convolution operations including backward FFT
|
||||
// gridppost = my portion of 4d grid in brick decomp w/ ghost values
|
||||
|
||||
double ****gridpost = (double ****) pc_kspace->post_convolution();
|
||||
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution();
|
||||
|
||||
// get potential
|
||||
|
||||
@ -1857,7 +1870,7 @@ void PairAmoeba::polar_kspace()
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by zero()
|
||||
|
||||
double ***gridpre = (double ***) p_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1887,7 +1900,7 @@ void PairAmoeba::polar_kspace()
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by zero()
|
||||
|
||||
gridpre = (double ***) p_kspace->zero();
|
||||
gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1896,7 +1909,7 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors
|
||||
|
||||
double *gridfft2 = p_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
@ -1953,7 +1966,7 @@ void PairAmoeba::polar_kspace()
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
// zeroed by zero()
|
||||
|
||||
double ***gridpre = (double ***) p_kspace->zero();
|
||||
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1962,12 +1975,12 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
|
||||
|
||||
double *gridfft = p_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
|
||||
|
||||
// gridfft1 = copy of first FFT
|
||||
|
||||
int nfft_owned = p_kspace->nfft_owned;
|
||||
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double));
|
||||
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR));
|
||||
|
||||
// assign ??? to the PME grid
|
||||
|
||||
@ -1982,7 +1995,7 @@ void PairAmoeba::polar_kspace()
|
||||
|
||||
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
|
||||
|
||||
gridpre = (double ***) p_kspace->zero();
|
||||
gridpre = (FFT_SCALAR ***) p_kspace->zero();
|
||||
|
||||
// map atoms to grid
|
||||
|
||||
@ -1991,7 +2004,7 @@ void PairAmoeba::polar_kspace()
|
||||
// pre-convolution operations including forward FFT
|
||||
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
|
||||
|
||||
double *gridfft2 = p_kspace->pre_convolution();
|
||||
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
|
||||
|
||||
// ---------------------
|
||||
// convolution operation
|
||||
|
||||
@ -194,8 +194,8 @@ void FixAmoebaBiTorsion::init()
|
||||
// error check that PairAmoeba or PairHiippo exist
|
||||
|
||||
pair = nullptr;
|
||||
pair = force->pair_match("amoeba",1,0);
|
||||
if (!pair) pair = force->pair_match("hippo",1,0);
|
||||
pair = force->pair_match("^amoeba",0,0);
|
||||
if (!pair) pair = force->pair_match("^hippo",0,0);
|
||||
|
||||
if (!pair)
|
||||
error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo");
|
||||
|
||||
@ -285,8 +285,9 @@ void ImproperAmoeba::init_style()
|
||||
// check if PairAmoeba disabled improper terms
|
||||
|
||||
Pair *pair = nullptr;
|
||||
pair = force->pair_match("amoeba",1,0);
|
||||
if (!pair) pair = force->pair_match("hippo",1,0);
|
||||
pair = force->pair_match("^amoeba",0,0);
|
||||
if (!pair) pair = force->pair_match("^hippo",0,0);
|
||||
|
||||
if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo");
|
||||
|
||||
int tmp;
|
||||
|
||||
@ -29,6 +29,7 @@
|
||||
#include "my_page.h"
|
||||
#include "neigh_list.h"
|
||||
#include "neighbor.h"
|
||||
#include "timer.h"
|
||||
#include "update.h"
|
||||
|
||||
#include <cmath>
|
||||
@ -47,6 +48,7 @@ enum{MUTUAL,OPT,TCG,DIRECT};
|
||||
enum{GEAR,ASPC,LSQR};
|
||||
|
||||
#define DELTASTACK 16
|
||||
#define DEBUG_AMOEBA 0
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
@ -85,6 +87,10 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp)
|
||||
cmp = fmp = nullptr;
|
||||
cphi = fphi = nullptr;
|
||||
|
||||
_moduli_array = nullptr;
|
||||
_moduli_bsarray = nullptr;
|
||||
_nfft_max = 0;
|
||||
|
||||
poli = nullptr;
|
||||
conj = conjp = nullptr;
|
||||
vec = vecp = nullptr;
|
||||
@ -227,6 +233,9 @@ PairAmoeba::~PairAmoeba()
|
||||
memory->destroy(fphidp);
|
||||
memory->destroy(cphidp);
|
||||
|
||||
memory->destroy(_moduli_array);
|
||||
memory->destroy(_moduli_bsarray);
|
||||
|
||||
memory->destroy(thetai1);
|
||||
memory->destroy(thetai2);
|
||||
memory->destroy(thetai3);
|
||||
@ -349,12 +358,22 @@ void PairAmoeba::compute(int eflag, int vflag)
|
||||
if (update->ntimestep <= update->beginstep+1) {
|
||||
time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0;
|
||||
time_induce = time_polar = time_qxfer = 0.0;
|
||||
|
||||
time_mpole_rspace = time_mpole_kspace = 0.0;
|
||||
time_direct_rspace = time_direct_kspace = 0.0;
|
||||
time_mutual_rspace = time_mutual_kspace = 0.0;
|
||||
time_polar_rspace = time_polar_kspace = 0.0;
|
||||
|
||||
time_grid_uind = time_fphi_uind = 0.0;
|
||||
if (ic_kspace) {
|
||||
ic_kspace->time_fft = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
double time0,time1,time2,time3,time4,time5,time6,time7,time8;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = MPI_Wtime();
|
||||
if (timer->has_sync()) MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// if reneighboring step:
|
||||
// augment neighbor list to include 1-5 neighbor flags
|
||||
@ -410,8 +429,7 @@ void PairAmoeba::compute(int eflag, int vflag)
|
||||
comm->forward_comm(this);
|
||||
|
||||
if (amoeba) pbc_xred();
|
||||
|
||||
time1 = MPI_Wtime();
|
||||
time1 = platform::walltime();
|
||||
|
||||
// ----------------------------------------
|
||||
// compute components of force field
|
||||
@ -420,22 +438,22 @@ void PairAmoeba::compute(int eflag, int vflag)
|
||||
// buffered 14-7 Vdwl, pairwise
|
||||
|
||||
if (amoeba && hal_flag) hal();
|
||||
time2 = MPI_Wtime();
|
||||
time2 = platform::walltime();
|
||||
|
||||
// Pauli repulsion, pairwise
|
||||
|
||||
if (!amoeba && repulse_flag) repulsion();
|
||||
time3 = MPI_Wtime();
|
||||
time3 = platform::walltime();
|
||||
|
||||
// Ewald dispersion, pairwise and long range
|
||||
|
||||
if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion();
|
||||
time4 = MPI_Wtime();
|
||||
time4 = platform::walltime();
|
||||
|
||||
// multipole, pairwise and long range
|
||||
|
||||
if (mpole_rspace_flag || mpole_kspace_flag) multipole();
|
||||
time5 = MPI_Wtime();
|
||||
time5 = platform::walltime();
|
||||
|
||||
// induced dipoles, interative CG relaxation
|
||||
// communicate induce() output values needed by ghost atoms
|
||||
@ -445,17 +463,17 @@ void PairAmoeba::compute(int eflag, int vflag)
|
||||
cfstyle = INDUCE;
|
||||
comm->forward_comm(this);
|
||||
}
|
||||
time6 = MPI_Wtime();
|
||||
time6 = platform::walltime();
|
||||
|
||||
// dipoles, pairwise and long range
|
||||
|
||||
if (polar_rspace_flag || polar_kspace_flag) polar();
|
||||
time7 = MPI_Wtime();
|
||||
time7 = platform::walltime();
|
||||
|
||||
// charge transfer, pairwise
|
||||
|
||||
if (!amoeba && qxfer_flag) charge_transfer();
|
||||
time8 = MPI_Wtime();
|
||||
time8 = platform::walltime();
|
||||
|
||||
// store energy components for output by compute pair command
|
||||
|
||||
@ -518,6 +536,44 @@ void PairAmoeba::finish()
|
||||
MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_qxfer = ave/comm->nprocs;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
// real-space/kspace breakdown
|
||||
MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mpole_rspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mpole_kspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_direct_rspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_direct_kspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mutual_rspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mutual_kspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_polar_rspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_polar_kspace = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_grid_uind = ave/comm->nprocs;
|
||||
|
||||
MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_fphi_uind = ave/comm->nprocs;
|
||||
|
||||
double time_mutual_fft = 0;
|
||||
if (ic_kspace) time_mutual_fft = ic_kspace->time_fft;
|
||||
MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mutual_fft = ave/comm->nprocs;
|
||||
#endif // DEBUG_AMOEBA
|
||||
|
||||
double time_total = (time_init + time_hal + time_repulse + time_disp +
|
||||
time_mpole + time_induce + time_polar + time_qxfer) / 100.0;
|
||||
|
||||
@ -534,8 +590,27 @@ void PairAmoeba::finish()
|
||||
utils::logmesg(lmp," Induce time: {:<12.6g} {:6.2f}%\n", time_induce, time_induce/time_total);
|
||||
utils::logmesg(lmp," Polar time: {:<12.6g} {:6.2f}%\n", time_polar, time_polar/time_total);
|
||||
if (!amoeba)
|
||||
utils::logmesg(lmp," Qxfer time: {:<12.6g} {:6.2f}%\n", time_qxfer, time_qxfer/time_total);
|
||||
utils::logmesg(lmp," Total time: {:<12.6g}\n",time_total * 100.0);
|
||||
utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total);
|
||||
utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0);
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace;
|
||||
double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace;
|
||||
|
||||
utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total);
|
||||
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total);
|
||||
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total);
|
||||
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total);
|
||||
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total);
|
||||
utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total);
|
||||
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total);
|
||||
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total);
|
||||
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total);
|
||||
utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total);
|
||||
utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total);
|
||||
utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total);
|
||||
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -2320,6 +2395,8 @@ void PairAmoeba::grow_local()
|
||||
firstneigh_pcpc = (double **)
|
||||
memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc");
|
||||
}
|
||||
|
||||
memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array");
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
|
||||
@ -82,6 +82,12 @@ class PairAmoeba : public Pair {
|
||||
double time_init, time_hal, time_repulse, time_disp;
|
||||
double time_mpole, time_induce, time_polar, time_qxfer;
|
||||
|
||||
double time_mpole_rspace, time_mpole_kspace;
|
||||
double time_direct_rspace, time_direct_kspace;
|
||||
double time_mutual_rspace, time_mutual_kspace;
|
||||
double time_polar_rspace, time_polar_kspace;
|
||||
double time_grid_uind, time_fphi_uind;
|
||||
|
||||
// energy/virial components
|
||||
|
||||
double ehal, erepulse, edisp, epolar, empole, eqxfer;
|
||||
@ -327,6 +333,10 @@ class PairAmoeba : public Pair {
|
||||
double **cmp,**fmp; // Cartesian and fractional multipoles
|
||||
double **cphi,**fphi;
|
||||
|
||||
double *_moduli_array; // buffers for moduli
|
||||
double *_moduli_bsarray;
|
||||
int _nfft_max;
|
||||
|
||||
// params for current KSpace solve and FFT being worked on
|
||||
|
||||
int nfft1, nfft2, nfft3; // size of FFT
|
||||
@ -335,8 +345,12 @@ class PairAmoeba : public Pair {
|
||||
double ctf[10][10]; // indices NOT flipped vs Fortran
|
||||
double ftc[10][10]; // indices NOT flipped vs Fortran
|
||||
|
||||
class AmoebaConvolution *m_kspace, *p_kspace, *pc_kspace, *d_kspace;
|
||||
class AmoebaConvolution *i_kspace, *ic_kspace;
|
||||
class AmoebaConvolution *m_kspace; // multipole KSpace
|
||||
class AmoebaConvolution *p_kspace; // polar KSpace
|
||||
class AmoebaConvolution *pc_kspace;
|
||||
class AmoebaConvolution *d_kspace; // dispersion KSpace
|
||||
class AmoebaConvolution *i_kspace; // induce KSpace
|
||||
class AmoebaConvolution *ic_kspace;
|
||||
|
||||
// FFT grid size factors
|
||||
|
||||
@ -347,33 +361,33 @@ class PairAmoeba : public Pair {
|
||||
|
||||
void hal();
|
||||
|
||||
void repulsion();
|
||||
void damprep(double, double, double, double, double, double, double, double, int, double, double,
|
||||
double *);
|
||||
virtual void repulsion();
|
||||
void damprep(double, double, double, double, double, double, double, double,
|
||||
int, double, double, double *);
|
||||
|
||||
void dispersion();
|
||||
void dispersion_real();
|
||||
virtual void dispersion_real();
|
||||
void dispersion_kspace();
|
||||
|
||||
void multipole();
|
||||
void multipole_real();
|
||||
virtual void multipole_real();
|
||||
void multipole_kspace();
|
||||
|
||||
void polar();
|
||||
void polar_energy();
|
||||
void polar_real();
|
||||
void polar_kspace();
|
||||
virtual void polar_real();
|
||||
virtual void polar_kspace();
|
||||
void damppole(double, int, double, double, double *, double *, double *);
|
||||
|
||||
void induce();
|
||||
virtual void induce();
|
||||
void ulspred();
|
||||
void ufield0c(double **, double **);
|
||||
virtual void ufield0c(double **, double **);
|
||||
void uscale0b(int, double **, double **, double **, double **);
|
||||
void dfield0c(double **, double **);
|
||||
void umutual1(double **, double **);
|
||||
void umutual2b(double **, double **);
|
||||
virtual void umutual1(double **, double **);
|
||||
virtual void umutual2b(double **, double **);
|
||||
void udirect1(double **);
|
||||
void udirect2b(double **, double **);
|
||||
virtual void udirect2b(double **, double **);
|
||||
void dampmut(double, double, double, double *);
|
||||
void dampdir(double, double, double, double *, double *);
|
||||
void cholesky(int, double *, double *);
|
||||
@ -393,11 +407,11 @@ class PairAmoeba : public Pair {
|
||||
void fphi_to_cphi(double **, double **);
|
||||
void frac_to_cart();
|
||||
|
||||
void grid_mpole(double **, double ***);
|
||||
void fphi_mpole(double ***, double **);
|
||||
void grid_uind(double **, double **, double ****);
|
||||
void fphi_uind(double ****, double **, double **, double **);
|
||||
void grid_disp(double ***);
|
||||
void grid_mpole(double **, FFT_SCALAR ***);
|
||||
void fphi_mpole(FFT_SCALAR ***, double **);
|
||||
void grid_uind(double **, double **, FFT_SCALAR ****);
|
||||
virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **);
|
||||
void grid_disp(FFT_SCALAR ***);
|
||||
|
||||
void kewald();
|
||||
void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &,
|
||||
|
||||
@ -45,6 +45,10 @@ depend () {
|
||||
# add one if statement per parent package
|
||||
# add one depend() call per child package that depends on that parent
|
||||
|
||||
if (test $1 = "AMOEBA") then
|
||||
depend GPU
|
||||
fi
|
||||
|
||||
if (test $1 = "ASPHERE") then
|
||||
depend GPU
|
||||
depend OPENMP
|
||||
|
||||
@ -28,6 +28,8 @@ action () {
|
||||
|
||||
# list of files with optional dependcies
|
||||
|
||||
action amoeba_convolution_gpu.cpp amoeba_convolution.cpp
|
||||
action amoeba_convolution_gpu.h amoeba_convolution.cpp
|
||||
action fix_gpu.cpp
|
||||
action fix_gpu.h
|
||||
action fix_nve_gpu.h
|
||||
@ -41,6 +43,8 @@ action fix_npt_gpu.cpp
|
||||
action fix_nve_asphere_gpu.h fix_nve_asphere.h
|
||||
action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp
|
||||
action gpu_extra.h
|
||||
action pair_amoeba_gpu.cpp pair_amoeba.cpp
|
||||
action pair_amoeba_gpu.h pair_amoeba.h
|
||||
action pair_beck_gpu.cpp pair_beck.cpp
|
||||
action pair_beck_gpu.h pair_beck.h
|
||||
action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp
|
||||
@ -89,6 +93,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp
|
||||
action pair_gauss_gpu.h pair_gauss.h
|
||||
action pair_gayberne_gpu.cpp pair_gayberne.cpp
|
||||
action pair_gayberne_gpu.h pair_gayberne.cpp
|
||||
action pair_hippo_gpu.cpp pair_hippo.cpp
|
||||
action pair_hippo_gpu.h pair_hippo.cpp
|
||||
action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp
|
||||
action pair_lj96_cut_gpu.h pair_lj96_cut.h
|
||||
action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp
|
||||
@ -113,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp
|
||||
action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h
|
||||
action pair_lj_cut_gpu.cpp
|
||||
action pair_lj_cut_gpu.h
|
||||
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
|
||||
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
|
||||
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
|
||||
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
|
||||
action pair_lj_smooth_gpu.cpp pair_lj_smooth.cpp
|
||||
action pair_lj_smooth_gpu.h pair_lj_smooth.cpp
|
||||
action pair_lj_expand_gpu.cpp
|
||||
@ -155,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp
|
||||
action pppm_gpu.h pppm.cpp
|
||||
action pair_ufm_gpu.cpp pair_ufm.cpp
|
||||
action pair_ufm_gpu.h pair_ufm.h
|
||||
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
|
||||
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
|
||||
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
|
||||
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
|
||||
|
||||
# edit 2 Makefile.package files to include/exclude package info
|
||||
|
||||
|
||||
181
src/GPU/amoeba_convolution_gpu.cpp
Normal file
181
src/GPU/amoeba_convolution_gpu.cpp
Normal file
@ -0,0 +1,181 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/ Sandia National Laboratories
|
||||
LAMMPS Development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "amoeba_convolution_gpu.h"
|
||||
#include "comm.h"
|
||||
#include "fft3d_wrap.h"
|
||||
#include "remap_wrap.h"
|
||||
#include "grid3d.h"
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
// DEBUG
|
||||
|
||||
#define DEBUG_AMOEBA 0
|
||||
#if DEBUG_AMOEBA
|
||||
char *labels[7] =
|
||||
{(char *) "MPOLE_GRID", (char *) "POLAR_GRID",
|
||||
(char *) "POLAR_GRIDC", (char *) "DISP_GRID",
|
||||
(char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"};
|
||||
|
||||
enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2};
|
||||
#endif
|
||||
// END DEBUG
|
||||
|
||||
#define SCALE 0
|
||||
|
||||
//#define USE_AMOEBA_FFT
|
||||
#ifdef USE_AMOEBA_FFT
|
||||
// External functions from GPU library
|
||||
int amoeba_setup_fft(const int size, const int numel, const int element_type);
|
||||
int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode);
|
||||
#endif
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
partition an FFT grid across processors
|
||||
both for a brick and FFT x pencil decomposition
|
||||
nx,nz,nz = global FFT grid size
|
||||
order = size of stencil in each dimension that maps atoms to grid
|
||||
adapted from PPPM::set_grid_local()
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair,
|
||||
int nx_caller, int ny_caller, int nz_caller,
|
||||
int order_caller, int which_caller) :
|
||||
AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller,
|
||||
which_caller)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
perform pre-convolution grid operations for 4d cgrid_brick array
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
|
||||
{
|
||||
int ix,iy,iz,n;
|
||||
|
||||
// reverse comm for 4d brick grid + ghosts
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d");
|
||||
#endif
|
||||
|
||||
gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
|
||||
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d");
|
||||
debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d");
|
||||
#endif
|
||||
// copy owned 4d brick grid values to FFT grid
|
||||
|
||||
n = 0;
|
||||
for (iz = nzlo_in; iz <= nzhi_in; iz++)
|
||||
for (iy = nylo_in; iy <= nyhi_in; iy++)
|
||||
for (ix = nxlo_in; ix <= nxhi_in; ix++) {
|
||||
cfft[n++] = cgrid_brick[iz][iy][ix][0];
|
||||
cfft[n++] = cgrid_brick[iz][iy][ix][1];
|
||||
}
|
||||
|
||||
// remap FFT grid from brick to x pencil partitioning
|
||||
// NOTE: could just setup FFT to start from brick decomp and skip remap
|
||||
|
||||
remap->perform(cfft,cfft,remap_buf);
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(FFT,"PRE Convo / POST Remap");
|
||||
debug_file(FFT,"pre.convo.post.remap");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
// perform forward FFT
|
||||
|
||||
#ifdef USE_AMOEBA_FFT
|
||||
amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD);
|
||||
#else
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
#endif
|
||||
|
||||
time1 = platform::walltime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
|
||||
}
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"PRE Convo / POST FFT");
|
||||
debug_file(CFFT1,"pre.convo.post.fft");
|
||||
#endif
|
||||
return cfft;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
perform post-convolution grid operations for 4d cgrid_brick array
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void *AmoebaConvolutionGPU::post_convolution_4d()
|
||||
{
|
||||
int ix,iy,iz,n;
|
||||
|
||||
// perform backward FFT
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"POST Convo / PRE FFT");
|
||||
debug_file(CFFT1,"post.convo.pre.fft");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = platform::walltime();
|
||||
|
||||
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
|
||||
|
||||
time1 = platform::walltime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT2,"POST Convo / POST FFT");
|
||||
debug_file(CFFT2,"post.convo.post.fft");
|
||||
#endif
|
||||
// copy 1d complex values into 4d complex grid
|
||||
|
||||
n = 0;
|
||||
for (iz = nzlo_in; iz <= nzhi_in; iz++)
|
||||
for (iy = nylo_in; iy <= nyhi_in; iy++)
|
||||
for (ix = nxlo_in; ix <= nxhi_in; ix++) {
|
||||
cgrid_brick[iz][iy][ix][0] = cfft[n++];
|
||||
cgrid_brick[iz][iy][ix][1] = cfft[n++];
|
||||
}
|
||||
|
||||
// forward comm to populate ghost grid values
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d");
|
||||
debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d");
|
||||
#endif
|
||||
gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
|
||||
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
|
||||
|
||||
return (void *) cgrid_brick;
|
||||
}
|
||||
32
src/GPU/amoeba_convolution_gpu.h
Normal file
32
src/GPU/amoeba_convolution_gpu.h
Normal file
@ -0,0 +1,32 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/ Sandia National Laboratories
|
||||
LAMMPS Development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H
|
||||
#define LMP_AMOEBA_CONVOLUTION_GPU_H
|
||||
|
||||
#include "amoeba_convolution.h"
|
||||
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class AmoebaConvolutionGPU : public AmoebaConvolution {
|
||||
public:
|
||||
AmoebaConvolutionGPU(class LAMMPS *, class Pair *, int, int, int, int, int);
|
||||
|
||||
FFT_SCALAR *pre_convolution_4d() override;
|
||||
void *post_convolution_4d() override;
|
||||
|
||||
};
|
||||
|
||||
} // namespace LAMMPS_NS
|
||||
#endif
|
||||
@ -131,7 +131,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
|
||||
_gpu_mode = GPU_NEIGH;
|
||||
_particle_split = 1.0;
|
||||
int nthreads = 0;
|
||||
int newtonflag = 0;
|
||||
int newtonflag = force->newton_pair;
|
||||
int threads_per_atom = -1;
|
||||
double binsize = 0.0;
|
||||
char *opencl_args = nullptr;
|
||||
@ -360,6 +360,8 @@ double FixGPU::memory_usage()
|
||||
return bytes;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
double FixGPU::binsize(const double subx, const double suby,
|
||||
const double subz, const int nlocal,
|
||||
const double cut) {
|
||||
|
||||
2067
src/GPU/pair_amoeba_gpu.cpp
Normal file
2067
src/GPU/pair_amoeba_gpu.cpp
Normal file
File diff suppressed because it is too large
Load Diff
72
src/GPU/pair_amoeba_gpu.h
Normal file
72
src/GPU/pair_amoeba_gpu.h
Normal file
@ -0,0 +1,72 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
LAMMPS Development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
// clang-format off
|
||||
PairStyle(amoeba/gpu,PairAmoebaGPU);
|
||||
// clang-format on
|
||||
#else
|
||||
|
||||
#ifndef LMP_PAIR_AMOEBA_GPU_H
|
||||
#define LMP_PAIR_AMOEBA_GPU_H
|
||||
|
||||
#include "pair_amoeba.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class PairAmoebaGPU : public PairAmoeba {
|
||||
public:
|
||||
PairAmoebaGPU(LAMMPS *lmp);
|
||||
~PairAmoebaGPU() override;
|
||||
void init_style() override;
|
||||
double memory_usage() override;
|
||||
|
||||
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
|
||||
|
||||
void induce() override;
|
||||
|
||||
void multipole_real() override;
|
||||
void udirect2b(double **, double **) override;
|
||||
void umutual1(double **, double **) override;
|
||||
void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
|
||||
void umutual2b(double **, double **) override;
|
||||
void ufield0c(double **, double **) override;
|
||||
void polar_real() override;
|
||||
void polar_kspace() override;
|
||||
|
||||
private:
|
||||
int gpu_mode;
|
||||
double cpu_time;
|
||||
void *tq_pinned;
|
||||
void *fieldp_pinned;
|
||||
bool acc_float;
|
||||
|
||||
bool gpu_hal_ready;
|
||||
bool gpu_repulsion_ready;
|
||||
bool gpu_dispersion_real_ready;
|
||||
bool gpu_multipole_real_ready;
|
||||
bool gpu_udirect2b_ready;
|
||||
bool gpu_umutual1_ready;
|
||||
bool gpu_fphi_uind_ready;
|
||||
bool gpu_umutual2b_ready;
|
||||
bool gpu_polar_real_ready;
|
||||
|
||||
void udirect2b_cpu();
|
||||
|
||||
template<class numtyp>
|
||||
void compute_force_from_torque(const numtyp*, double**, double*);
|
||||
};
|
||||
|
||||
} // namespace LAMMPS_NS
|
||||
#endif
|
||||
#endif
|
||||
1494
src/GPU/pair_hippo_gpu.cpp
Normal file
1494
src/GPU/pair_hippo_gpu.cpp
Normal file
File diff suppressed because it is too large
Load Diff
73
src/GPU/pair_hippo_gpu.h
Normal file
73
src/GPU/pair_hippo_gpu.h
Normal file
@ -0,0 +1,73 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
LAMMPS Development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef PAIR_CLASS
|
||||
// clang-format off
|
||||
PairStyle(hippo/gpu,PairHippoGPU);
|
||||
// clang-format on
|
||||
#else
|
||||
|
||||
#ifndef LMP_PAIR_HIPPO_GPU_H
|
||||
#define LMP_PAIR_HIPPO_GPU_H
|
||||
|
||||
#include "pair_amoeba.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
class PairHippoGPU : public PairAmoeba {
|
||||
public:
|
||||
PairHippoGPU(LAMMPS *lmp);
|
||||
~PairHippoGPU() override;
|
||||
void init_style() override;
|
||||
double memory_usage() override;
|
||||
|
||||
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
|
||||
|
||||
void induce() override;
|
||||
|
||||
void repulsion() override;
|
||||
void dispersion_real() override;
|
||||
void multipole_real() override;
|
||||
void udirect2b(double **, double **) override;
|
||||
void umutual1(double **, double **) override;
|
||||
void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
|
||||
void umutual2b(double **, double **) override;
|
||||
void ufield0c(double **, double **) override;
|
||||
void polar_real() override;
|
||||
|
||||
private:
|
||||
int gpu_mode;
|
||||
double cpu_time;
|
||||
void *tq_pinned;
|
||||
void *fieldp_pinned;
|
||||
bool acc_float;
|
||||
|
||||
bool gpu_hal_ready;
|
||||
bool gpu_repulsion_ready;
|
||||
bool gpu_dispersion_real_ready;
|
||||
bool gpu_multipole_real_ready;
|
||||
bool gpu_udirect2b_ready;
|
||||
bool gpu_umutual1_ready;
|
||||
bool gpu_fphi_uind_ready;
|
||||
bool gpu_umutual2b_ready;
|
||||
bool gpu_polar_real_ready;
|
||||
|
||||
void udirect2b_cpu();
|
||||
|
||||
template<class numtyp>
|
||||
void compute_force_from_torque(const numtyp*, double**, double*);
|
||||
};
|
||||
|
||||
} // namespace LAMMPS_NS
|
||||
#endif
|
||||
#endif
|
||||
@ -204,6 +204,8 @@ action mliap_model_linear_kokkos.h mliap_model_linear.h
|
||||
action mliap_model_python_kokkos.cpp mliap_model_linear.cpp
|
||||
action mliap_model_python_kokkos.h mliap_model_linear.h
|
||||
action mliap_model_kokkos.h mliap_model.h
|
||||
action mliap_unified_kokkos.cpp mliap_unified.cpp
|
||||
action mliap_unified_kokkos.h mliap_unified.h
|
||||
action mliap_so3_kokkos.cpp mliap_so3.cpp
|
||||
action mliap_so3_kokkos.h mliap_so3.h
|
||||
action modify_kokkos.cpp
|
||||
@ -314,6 +316,8 @@ action pair_lj_spica_kokkos.cpp pair_lj_spica.cpp
|
||||
action pair_lj_spica_kokkos.h pair_lj_spica.h
|
||||
action pair_meam_kokkos.cpp pair_meam.cpp
|
||||
action pair_meam_kokkos.h pair_meam.h
|
||||
action pair_meam_ms_kokkos.cpp pair_meam_ms.cpp
|
||||
action pair_meam_ms_kokkos.h pair_meam_ms.h
|
||||
action pair_mliap_kokkos.cpp pair_mliap.cpp
|
||||
action pair_mliap_kokkos.h pair_mliap.h
|
||||
action pair_morse_kokkos.cpp
|
||||
@ -365,6 +369,7 @@ action verlet_kokkos.h
|
||||
|
||||
# Install cython pyx file only if non-KOKKOS version is present
|
||||
action mliap_model_python_couple_kokkos.pyx mliap_model_python_couple.pyx
|
||||
action mliap_unified_couple_kokkos.pyx mliap_unified_couple.pyx
|
||||
|
||||
# edit 2 Makefile.package files to include/exclude package info
|
||||
|
||||
@ -423,15 +428,19 @@ fi
|
||||
if (test $1 = 1) then
|
||||
if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then
|
||||
cythonize -3 ../mliap_model_python_couple_kokkos.pyx
|
||||
cythonize -3 ../mliap_unified_couple_kokkos.pyx
|
||||
fi
|
||||
|
||||
elif (test $1 = 0) then
|
||||
rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h
|
||||
rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h
|
||||
|
||||
elif (test $1 = 2) then
|
||||
if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then
|
||||
cythonize -3 ../mliap_model_python_couple_kokkos.pyx
|
||||
cythonize -3 ../mliap_unified_couple_kokkos.pyx
|
||||
else
|
||||
rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h
|
||||
rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -39,7 +39,7 @@ FixNVTKokkos<DeviceType>::FixNVTKokkos(LAMMPS *lmp, int narg, char **arg) :
|
||||
// id = fix-ID + temp
|
||||
|
||||
this->id_temp = utils::strdup(std::string(this->id)+"_temp");
|
||||
this->modify->add_compute(fmt::format("{} all temp/kk",this->id_temp));
|
||||
this->modify->add_compute(fmt::format("{} {} temp/kk",this->id_temp,this->group->names[this->igroup]));
|
||||
this->tcomputeflag = 1;
|
||||
}
|
||||
|
||||
|
||||
@ -67,7 +67,7 @@ FixNVTSllodKokkos<DeviceType>::FixNVTSllodKokkos(LAMMPS *lmp, int narg, char **a
|
||||
}
|
||||
|
||||
this->id_temp = utils::strdup(std::string(this->id)+"_temp");
|
||||
this->modify->add_compute(fmt::format("{} all temp/deform/kk",this->id_temp));
|
||||
this->modify->add_compute(fmt::format("{} {} temp/deform/kk",this->id_temp,this->group->names[this->igroup]));
|
||||
this->tcomputeflag = 1;
|
||||
this->nondeformbias = 0;
|
||||
}
|
||||
|
||||
@ -77,9 +77,8 @@ void FixSetForceKokkos<DeviceType>::init()
|
||||
template<class DeviceType>
|
||||
void FixSetForceKokkos<DeviceType>::post_force(int /*vflag*/)
|
||||
{
|
||||
atomKK->sync(execution_space, X_MASK | F_MASK | MASK_MASK);
|
||||
atomKK->sync(execution_space, F_MASK | MASK_MASK);
|
||||
|
||||
x = atomKK->k_x.view<DeviceType>();
|
||||
f = atomKK->k_f.view<DeviceType>();
|
||||
mask = atomKK->k_mask.view<DeviceType>();
|
||||
|
||||
@ -88,6 +87,8 @@ void FixSetForceKokkos<DeviceType>::post_force(int /*vflag*/)
|
||||
// update region if necessary
|
||||
|
||||
if (region) {
|
||||
if (!utils::strmatch(region->style, "^block"))
|
||||
error->all(FLERR,"Cannot (yet) use {}-style region with fix setforce/kk",region->style);
|
||||
region->prematch();
|
||||
DAT::tdual_int_1d k_match = DAT::tdual_int_1d("setforce:k_match",nlocal);
|
||||
KokkosBase* regionKKBase = dynamic_cast<KokkosBase*>(region);
|
||||
|
||||
@ -61,17 +61,44 @@ void MEAMKokkos<DeviceType>::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT
|
||||
if (elti >= 0) {
|
||||
scaleii = d_scale(type[i],type[i]);
|
||||
d_rho1[i] = 0.0;
|
||||
if (msmeamflag) {
|
||||
d_rho2[i] = -1.0 / 3.0 * (d_arho2b[i] * d_arho2b[i]
|
||||
- d_arho2mb[i] * d_arho2mb[i]);
|
||||
} else{
|
||||
d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i];
|
||||
}
|
||||
d_rho3[i] = 0.0;
|
||||
for (int m = 0; m < 3; m++) {
|
||||
if (msmeamflag) {
|
||||
d_rho1[i] = d_rho1[i] + d_arho1(i, m) * d_arho1(i, m)
|
||||
- d_arho1m(i, m) * d_arho1m(i, m);
|
||||
d_rho3[i] = d_rho3[i] - 3.0 / 5.0 * (d_arho3b(i, m) * d_arho3b(i, m)
|
||||
- d_arho3mb(i, m) * d_arho3mb(i, m));
|
||||
} else{
|
||||
d_rho1[i] += d_arho1(i,m) * d_arho1(i,m);
|
||||
d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m);
|
||||
}
|
||||
for (int m = 0; m < 6; m++)
|
||||
}
|
||||
for (int m = 0; m < 6; m++){
|
||||
if (msmeamflag) {
|
||||
d_rho2[i] = d_rho2[i] + v2D[m] * (d_arho2(i, m) * d_arho2(i, m)
|
||||
- d_arho2m(i, m) * d_arho2m(i, m));
|
||||
} else{
|
||||
d_rho2[i] += v2D[m] * d_arho2(i,m) * d_arho2(i,m);
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 10; m++)
|
||||
if (msmeamflag) {
|
||||
d_rho3[i] = d_rho3[i] + v3D[m] * (d_arho3(i, m) * d_arho3(i, m)
|
||||
- d_arho3m(i, m) * d_arho3m(i, m));
|
||||
} else{
|
||||
d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m);
|
||||
}
|
||||
|
||||
if (msmeamflag) {
|
||||
// with msmeam all t weights are already accounted for in rho
|
||||
d_gamma[i] = d_rho1[i] + d_rho2[i] + d_rho3[i];
|
||||
} else{
|
||||
if (d_rho0[i] > 0.0) {
|
||||
if (ialloy == 1) {
|
||||
d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0));
|
||||
@ -87,8 +114,8 @@ void MEAMKokkos<DeviceType>::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT
|
||||
d_t_ave(i,2) /= d_rho0[i];
|
||||
}
|
||||
}
|
||||
|
||||
d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i];
|
||||
}
|
||||
|
||||
if (d_rho0[i] > 0.0)
|
||||
d_gamma[i] /= (d_rho0[i] * d_rho0[i]);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user