Merge branch 'develop'

Axel Kohlmeyer
2023-01-29 13:27:54 -05:00
138 changed files with 16070 additions and 1717 deletions

View File

@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir)
endfunction()
macro(pkg_depends PKG1 PKG2)
if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2}))
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package")
if(DEFINED BUILD_${PKG2})
if(PKG_${PKG1} AND NOT BUILD_${PKG2})
message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON")
endif()
elseif(DEFINED PKG_${PKG2})
if(PKG_${PKG1} AND NOT PKG_${PKG2})
message(WARNING "The ${PKG1} package depends on the ${PKG2} package. Enabling it.")
set(PKG_${PKG2} ON CACHE BOOL "" FORCE)
endif()
endif()
endmacro()
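The revised macro distinguishes two cases: when the dependency is a plain build flag (BUILD_<PKG2>), a missing dependency remains a hard configuration error; when it is another package (PKG_<PKG2>), it is now enabled automatically with a warning instead of aborting. A minimal sketch of call sites (the package pairings below are illustrative, not taken from this patch):

pkg_depends(ML-IAP ML-SNAP)  # PKG_ML-SNAP defined but OFF: auto-enabled with a warning
pkg_depends(GPU MPI)         # BUILD_MPI defined but OFF: configuration stops with FATAL_ERROR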

View File

@ -1,4 +1,9 @@
find_package(ZLIB REQUIRED)
find_package(ZLIB)
if(NOT ZLIB_FOUND)
message(WARNING "No Zlib development support found. Disabling COMPRESS package...")
set(PKG_COMPRESS OFF CACHE BOOL "" FORCE)
return()
endif()
target_link_libraries(lammps PRIVATE ZLIB::ZLIB)
find_package(PkgConfig QUIET)

View File

@ -26,6 +26,19 @@ elseif(GPU_PREC STREQUAL "SINGLE")
set(GPU_PREC_SETTING "SINGLE_SINGLE")
endif()
option(GPU_DEBUG "Enable debugging code of the GPU package" OFF)
mark_as_advanced(GPU_DEBUG)
if(PKG_AMOEBA AND FFT_SINGLE)
message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT")
endif()
if (PKG_AMOEBA)
list(APPEND GPU_SOURCES
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h
${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp)
endif()
file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
@ -151,7 +164,12 @@ if(GPU_API STREQUAL "CUDA")
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS})
if(GPU_DEBUG)
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
else()
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
endif()
if(CUDPP_OPT)
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
@ -192,6 +210,7 @@ elseif(GPU_API STREQUAL "OPENCL")
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu
)
foreach(GPU_KERNEL ${GPU_LIB_CU})
@ -208,6 +227,7 @@ elseif(GPU_API STREQUAL "OPENCL")
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu)
list(APPEND GPU_LIB_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
@ -217,14 +237,18 @@ elseif(GPU_API STREQUAL "OPENCL")
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h
)
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING})
if(GPU_DEBUG)
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
else()
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
endif()
target_link_libraries(lammps PRIVATE gpu)
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
@ -374,8 +398,12 @@ elseif(GPU_API STREQUAL "HIP")
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_HIP)
target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING})
if(GPU_DEBUG)
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
else()
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT)
endif()
target_link_libraries(gpu PRIVATE hip::host)
if(HIP_USE_DEVICE_SORT)

View File

@ -144,6 +144,7 @@ if(PKG_ML-IAP)
${KOKKOS_PKG_SOURCES_DIR}/mliap_descriptor_so3_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/mliap_model_linear_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/mliap_model_python_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/mliap_unified_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/mliap_so3_kokkos.cpp)
# Add KOKKOS version of ML-IAP Python coupling if non-KOKKOS version is included

View File

@ -126,10 +126,11 @@ CMake build
-D GPU_API=value # value = opencl (default) or cuda or hip
-D GPU_PREC=value # precision setting
# value = double or mixed (default) or single
-D HIP_PATH # path to HIP installation. Must be set if GPU_API=HIP
-D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda
# value = sm_XX, see below
# default is sm_50
# value = sm_XX (see below, default is sm_50)
-D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers
# value = yes or no (default)
-D HIP_PATH=value # value = path to HIP installation. Must be set if GPU_API=HIP
-D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip
# value depends on selected HIP_PLATFORM
# default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc
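For example, a debug-enabled OpenCL build of the GPU package could be configured like this (a sketch; the relative path to the cmake folder is assumed):

cmake ../cmake -D PKG_GPU=on -D GPU_API=opencl -D GPU_DEBUG=yes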

View File

@ -39,7 +39,7 @@ OPT.
* :doc:`agni (o) <pair_agni>`
* :doc:`airebo (io) <pair_airebo>`
* :doc:`airebo/morse (io) <pair_airebo>`
* :doc:`amoeba <pair_amoeba>`
* :doc:`amoeba (g) <pair_amoeba>`
* :doc:`atm <pair_atm>`
* :doc:`awpmd/cut <pair_awpmd>`
* :doc:`beck (go) <pair_beck>`
@ -126,7 +126,7 @@ OPT.
* :doc:`hbond/dreiding/lj (o) <pair_hbond_dreiding>`
* :doc:`hbond/dreiding/morse (o) <pair_hbond_dreiding>`
* :doc:`hdnnp <pair_hdnnp>`
* :doc:`hippo <pair_amoeba>`
* :doc:`hippo (g) <pair_amoeba>`
* :doc:`ilp/graphene/hbn (t) <pair_ilp_graphene_hbn>`
* :doc:`ilp/tmd (t) <pair_ilp_tmd>`
* :doc:`kolmogorov/crespi/full <pair_kolmogorov_crespi_full>`
@ -200,6 +200,7 @@ OPT.
* :doc:`mdpd <pair_mesodpd>`
* :doc:`mdpd/rhosum <pair_mesodpd>`
* :doc:`meam (k) <pair_meam>`
* :doc:`meam/ms (k) <pair_meam>`
* :doc:`meam/spline (o) <pair_meam_spline>`
* :doc:`meam/sw/spline <pair_meam_sw_spline>`
* :doc:`mesocnt <pair_mesocnt>`

View File

@ -149,6 +149,34 @@ related tasks for each of the partitions, e.g.
restart 1000 system_${ibead}.restart1 system_${ibead}.restart2
read_restart system_${ibead}.restart2
Restart, fix_modify, output, run start/stop, minimize info
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
This fix writes the state of the Nose/Hoover thermostat over all
quasi-beads to :doc:`binary restart files <restart>`. See the
:doc:`read_restart <read_restart>` command for info on how to re-specify
a fix in an input script that reads a restart file, so that the
operation of the fix continues in an uninterrupted fashion.
None of the :doc:`fix_modify <fix_modify>` options
are relevant to this fix.
This fix computes a global 3-vector, which can be accessed by various
:doc:`output commands <Howto_output>`. The three quantities in the
global vector are
#. the total spring energy of the quasi-beads,
#. the current temperature of the classical system of ring polymers,
#. the current value of the scalar virial estimator for the kinetic
energy of the quantum system :ref:`(Herman) <Herman>`.
The vector values calculated by this fix are "extensive", except for the
temperature, which is "intensive".
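For example, the three vector elements could be written to the thermo output like this (a sketch; the fix ID "1" and the elided fix arguments are placeholders):

fix 1 all pimd ...
thermo_style custom step temp f_1[1] f_1[2] f_1[3]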
No parameter of this fix can be used with the *start/stop* keywords of
the :doc:`run <run>` command. This fix is not invoked during
:doc:`energy minimization <minimize>`.
Restrictions
""""""""""""
@ -204,3 +232,8 @@ Path Integrals, McGraw-Hill, New York (1965).
**(Calhoun)** A. Calhoun, M. Pavese, G. Voth, Chem Phys Letters, 262,
415 (1996).
.. _Herman:
**(Herman)** M. F. Herman, E. J. Bruskin, B. J. Berne, J Chem Phys, 76, 5150 (1982).

View File

@ -39,6 +39,9 @@ Syntax
*masslimit* value = massmin massmax
massmin = minimum molecular weight of species to delete
massmax = maximum molecular weight of species to delete
*delete_rate_limit* value = Nlimit Nsteps
Nlimit = maximum number of deletions allowed to occur within the interval
Nsteps = the interval (number of timesteps) over which to count deletions
Examples
""""""""
@ -142,7 +145,13 @@ When using the *masslimit* keyword, each line of the *filedel* file
contains the timestep on which deletions occurs, followed by how many
of each species are deleted (with quantities preceding chemical
formulae). The *specieslist* and *masslimit* keywords cannot both be
used in the same *reaxff/species* fix.
used in the same *reaxff/species* fix. The *delete_rate_limit*
keyword can enforce an upper limit on the overall rate of molecule
deletion. The number of deletion occurrences is limited to Nlimit
within an interval of Nsteps timesteps. When using the
*delete_rate_limit* keyword, no deletions are permitted to occur
within the first Nsteps timesteps of the first run (after reading
either a data or a restart file).
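An illustrative invocation (a sketch; the fix arguments and file names are placeholders, not from this patch):

fix 1 all reaxff/species 10 10 100 species.out delete deleted.dat masslimit 0.0 60.0 delete_rate_limit 5 2000

Here at most 5 deletions may occur within any window of 2000 timesteps.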
----------

View File

@ -732,8 +732,8 @@ choices:
* Use one of the 4 NPT or NPH styles for the rigid bodies. Use the
*dilate* all option so that it will dilate the positions of the
*non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
*other thermostat) for the non-rigid particles.
non-rigid particles as well. Use :doc:`fix nvt <fix_nh>` (or any
other thermostat) for the non-rigid particles.
* Use :doc:`fix npt <fix_nh>` for the group of non-rigid particles. Use
the *dilate* all option so that it will dilate the center-of-mass
positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT

View File

@ -1,11 +1,18 @@
.. index:: pair_style amoeba
.. index:: pair_style amoeba/gpu
.. index:: pair_style hippo
.. index:: pair_style hippo/gpu
pair_style amoeba command
=========================
Accelerator Variants: *amoeba/gpu*
pair_style hippo command
========================
Accelerator Variants: *hippo/gpu*
Syntax
""""""
@ -127,6 +134,10 @@ version discussed in :ref:`(Ponder) <amoeba-Ponder>`, :ref:`(Ren)
implementation of HIPPO in LAMMPS matches the version discussed in
:ref:`(Rackers) <amoeba-Rackers>`.
.. versionadded:: TBD
Accelerator support via the GPU package is available.
----------
Only a single pair_coeff command is used with either the *amoeba* or
@ -187,6 +198,19 @@ These pair styles can only be used via the *pair* keyword of the
----------
.. include:: accel_styles.rst
.. note::
The GPU accelerated pair styles *amoeba/gpu* and *hippo/gpu* have a
few known issues when the GPU package is compiled for OpenCL and run
on integrated GPUs, and the calculation may crash.
The GPU accelerated pair styles are also not (yet) compatible
with single precision FFTs.
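As with other accelerator variants, the GPU versions can be selected explicitly in the input script or via the command-line suffix. A minimal sketch (the input file name is a placeholder):

pair_style amoeba/gpu
# or leave the input unchanged and run with: lmp -sf gpu -in in.amoeba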
----------
Restrictions
""""""""""""

View File

@ -1,17 +1,26 @@
.. index:: pair_style meam
.. index:: pair_style meam/kk
.. index:: pair_style meam/ms
.. index:: pair_style meam/ms/kk
pair_style meam command
=========================
Accelerator Variants: *meam/kk*
pair_style meam/ms command
==========================
Accelerator Variants: *meam/ms/kk*
Syntax
""""""
.. code-block:: LAMMPS
pair_style meam
pair_style style
* style = *meam* or *meam/ms*
Examples
""""""""
@ -22,6 +31,9 @@ Examples
pair_coeff * * ../potentials/library.meam Si ../potentials/si.meam Si
pair_coeff * * ../potentials/library.meam Ni Al NULL Ni Al Ni Ni
pair_style meam/ms
pair_coeff * * ../potentials/library.msmeam H Ga ../potentials/HGa.meam H Ga
Description
"""""""""""
@ -31,16 +43,23 @@ Description
as of November 2010; see description below of the mixture_ref_t
parameter
Pair style *meam* computes non-bonded interactions for a variety of materials
using the modified embedded-atom method (MEAM)
:ref:`(Baskes) <Baskes>`. Conceptually, it is an extension to the original
:doc:`EAM method <pair_eam>` which adds angular forces. It is
thus suitable for modeling metals and alloys with fcc, bcc, hcp and
diamond cubic structures, as well as materials with covalent interactions
like silicon and carbon. This *meam* pair style is a translation of the
original Fortran version to C++. It is functionally equivalent but more
efficient and has additional features. The Fortran version of the *meam*
pair style has been removed from LAMMPS after the 12 December 2018 release.
Pair style *meam* computes non-bonded interactions for a variety of
materials using the modified embedded-atom method (MEAM) :ref:`(Baskes)
<Baskes>`. Conceptually, it is an extension to the original :doc:`EAM
method <pair_eam>` which adds angular forces. It is thus suitable for
modeling metals and alloys with fcc, bcc, hcp and diamond cubic
structures, as well as materials with covalent interactions like silicon
and carbon.
The *meam* pair style is a translation of the original Fortran version
to C++. It is functionally equivalent but more efficient and has
additional features. The Fortran version of the *meam* pair style has
been removed from LAMMPS after the 12 December 2018 release.
Pair style *meam/ms* uses the multi-state MEAM (MS-MEAM) method
according to :ref:`(Baskes2) <Baskes2>`, which is an extension to MEAM.
This pair style is mostly equivalent to *meam* and differs only
where noted in the documentation below.
In the MEAM formulation, the total energy E of a system of atoms is
given by:
@ -351,6 +370,16 @@ Most published MEAM parameter sets use the default values *attrac* = *repulse* =
Setting *repuls* = *attrac* = *delta* corresponds to the form used in several
recent published MEAM parameter sets, such as :ref:`(Valone) <Valone>`
When using the *meam/ms* pair style, the multi-state MEAM (MS-MEAM)
method is activated. This requires 6 extra parameters in the MEAM
library file, resulting in 25 parameters that are ordered like this:
elt, lat, z, ielement, atwt, alpha, b0, b1, b2, b3, b1m, b2m, b3m, alat, esub, asub,
t0, t1, t2, t3, t1m, t2m, t3m, rozero, ibar
The 6 extra MS-MEAM parameters are *b1m, b2m, b3m, t1m, t2m, t3m*.
In the LAMMPS ``potentials`` folder, compatible files have an ".msmeam" extension.
----------
.. include:: accel_styles.rst
@ -393,16 +422,15 @@ This pair style can only be used via the *pair* keyword of the
Restrictions
""""""""""""
The *meam* style is provided in the MEAM package. It is
only enabled if LAMMPS was built with that package.
The *meam* and *meam/ms* pair styles are provided in the MEAM
package. They are only enabled if LAMMPS was built with that package.
See the :doc:`Build package <Build_package>` page for more info.
The maximum number of elements, that can be read from the MEAM
library file, is determined at compile time. The default is 5.
If you need support for more elements, you have to change the
define for the constant 'maxelt' at the beginning of the file
src/MEAM/meam.h and update/recompile LAMMPS. There is no
limit on the number of atoms types.
The maximum number of elements that can be read from the MEAM library
file is determined at compile time. The default is 5. If you need
support for more elements, you have to change the constant 'maxelt'
at the beginning of the file ``src/MEAM/meam.h`` and update/recompile
LAMMPS. There is no limit on the number of atom types.
Related commands
""""""""""""""""
@ -421,6 +449,10 @@ none
**(Baskes)** Baskes, Phys Rev B, 46, 2727-2742 (1992).
.. _Baskes2:
**(Baskes2)** Baskes, Phys Rev B, 75, 094113 (2007).
.. _Gullet:
**(Gullet)** Gullet, Wagner, Slepoy, SANDIA Report 2003-8782 (2003). DOI:10.2172/918395

View File

@ -277,7 +277,8 @@ accelerated styles exist.
* :doc:`lubricateU/poly <pair_lubricateU>` - hydrodynamic lubrication forces for Fast Lubrication with polydispersity
* :doc:`mdpd <pair_mesodpd>` - mDPD particle interactions
* :doc:`mdpd/rhosum <pair_mesodpd>` - mDPD particle interactions for mass density
* :doc:`meam <pair_meam>` - modified embedded atom method (MEAM) in C
* :doc:`meam <pair_meam>` - modified embedded atom method (MEAM)
* :doc:`meam/ms <pair_meam>` - multi-state modified embedded atom method (MS-MEAM)
* :doc:`meam/spline <pair_meam_spline>` - splined version of MEAM
* :doc:`meam/sw/spline <pair_meam_sw_spline>` - splined version of MEAM with a Stillinger-Weber term
* :doc:`mesocnt <pair_mesocnt>` - mesoscopic vdW potential for (carbon) nanotubes

View File

@ -0,0 +1,30 @@
bkgd_dyn = 1
emb_lin_neg = 1
augt1=0
ialloy=1
rc = 5.9
#H
attrac(1,1)=0.460
repuls(1,1)=0.460
Cmin(1,1,1)=1.3 # PuMS
Cmax(1,1,1)= 2.80
nn2(1,1)=1
#Ga
rho0(2) = 0.6
attrac(2,2)=0.097
repuls(2,2)=0.097
nn2(2,2)=1
#HGa
attrac(1,2)=0.300
repuls(1,2)=0.300
lattce(1,2)=l12
re(1,2)=3.19
delta(1,2)=-0.48
alpha(1,2)=6.6
Cmin(1,1,2)=2.0
Cmin(2,1,2)= 2.0
Cmin(1,2,1)=2.0
Cmin(2,2,1) = 1.4
Cmin(1,2,2) = 1.4
Cmin(1,1,2) = 1.4
nn2(1,2)=1

View File

@ -0,0 +1,9 @@
To run Baskes' test, do
lmp -in in.msmeam
Then
diff dump.msmeam dump.msmeam.bu

View File

@ -0,0 +1,25 @@
LAMMPS data file via write_data, version 16 Feb 2016, timestep = 1
3 atoms
2 atom types
-4.0000000000000000e+00 4.0000000000000000e+00 xlo xhi
-4.0000000000000000e+00 4.0000000000000000e+00 ylo yhi
-4.0000000000000000e+00 4.0000000000000000e+00 zlo zhi
Masses
1 1.0079
2 69.723
Atoms # atomic
1 1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0
2 2 2.2000000000000002e+00 0.0000000000000000e+00 0.0000000000000000e+00 0 0 0
3 2 2.9999999999999999e-01 2.2999999999999998e+00 0.0000000000000000e+00 0 0 0
Velocities
1 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
2 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00
3 0.0000000000000000e+00 0.0000000000000000e+00 0.0000000000000000e+00

View File

@ -0,0 +1,24 @@
ITEM: TIMESTEP
0
ITEM: NUMBER OF ATOMS
3
ITEM: BOX BOUNDS pp pp pp
-4 4
-4 4
-4 4
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
ITEM: TIMESTEP
1
ITEM: NUMBER OF ATOMS
3
ITEM: BOX BOUNDS pp pp pp
-4 4
-4 4
-4 4
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0

View File

@ -0,0 +1,31 @@
echo both
log log.msmeam
# Test of MEAM potential for HGa
# ------------------------ INITIALIZATION ----------------------------
units metal
dimension 3
boundary p p p
atom_style atomic
variable latparam equal 4.646
variable ncell equal 3
# ----------------------- ATOM DEFINITION ----------------------------
region box block -4 4 -4 4 -4 4
create_box 2 box
#
include potential.mod
create_atoms 1 single 0 0 0 units box
create_atoms 2 single 2.2 0 0 units box
create_atoms 2 single 0.3 2.3 0 units box
# ---------- Define Settings ---------------------
variable teng equal "c_eatoms"
compute pot_energy all pe/atom
compute stress all stress/atom NULL
dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
run 1
write_data data.msmeam
print "All done!"

View File

@ -0,0 +1,14 @@
# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010
# ms-meam data format May 2010
# elt lat z ielement atwt
# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub
# - t0 t1 t2 t3 t1m t2m t3m rozero ibar
# NOTE: leading character cannot be a space
'H' 'dim' 1.0 1 1.0079
2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50
1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0
'Ga4' 'fcc' 12.0 31 69.723
4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97
1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0

View File

@ -0,0 +1,107 @@
# Test of MEAM potential for HGa
# ------------------------ INITIALIZATION ----------------------------
units metal
dimension 3
boundary p p p
atom_style atomic
variable latparam equal 4.646
variable ncell equal 3
# ----------------------- ATOM DEFINITION ----------------------------
region box block -4 4 -4 4 -4 4
create_box 2 box
Created orthogonal box = (-4 -4 -4) to (4 4 4)
1 by 1 by 1 MPI processor grid
#
include potential.mod
# NOTE: This script can be modified for different pair styles
# See in.elastic for more info.
variable Pu string H
print "potential chosen ${Pu}"
potential chosen H
# Choose potential
pair_style MSmeam
print "we just executed"
we just executed
pair_coeff * * library.MSmeam ${Pu} Ga4 HGaMS.meam ${Pu} Ga4
pair_coeff * * library.MSmeam H Ga4 HGaMS.meam ${Pu} Ga4
pair_coeff * * library.MSmeam H Ga4 HGaMS.meam H Ga4
Reading potential file library.MSmeam with DATE: 2018-09-22
# Setup neighbor style
neighbor 1.0 nsq
neigh_modify once no every 1 delay 0 check yes
# Setup minimization style
variable dmax equal 1.0e-2
min_style cg
min_modify dmax ${dmax} line quadratic
min_modify dmax 0.01 line quadratic
compute eng all pe/atom
compute eatoms all reduce sum c_eng
# Setup output
thermo 100
thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms
thermo_modify norm yes
create_atoms 1 single 0 0 0 units box
Created 1 atoms
create_atoms 2 single 2.2 0 0 units box
Created 1 atoms
create_atoms 2 single 0.3 2.3 0 units box
Created 1 atoms
# ---------- Define Settings ---------------------
variable teng equal "c_eatoms"
compute pot_energy all pe/atom
compute stress all stress/atom NULL
dump 1 all custom 1 dump.msmeam id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
run 1
WARNING: No fixes defined, atoms won't move (../verlet.cpp:55)
Neighbor list info ...
2 neighbor list requests
update every 1 steps, delay 0 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 6.9
ghost atom cutoff = 6.9
Memory usage per processor = 12.9295 Mbytes
Step Temp TotEng Press Pxx Pyy Pzz Pxy Pxz Pyz Lx Ly Lz Volume eatoms
0 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079
1 0 15.433079 491354.68 838670.91 635393.13 0 80195.793 0 0 8 8 8 512 15.433079
Loop time of 0.000172138 on 1 procs for 1 steps with 3 atoms
Performance: 501.922 ns/day, 0.048 hours/ns, 5809.285 timesteps/s
81.3% CPU use with 1 MPI tasks x no OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 6.6996e-05 | 6.6996e-05 | 6.6996e-05 | 0.0 | 38.92
Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 1.9073e-06 | 1.9073e-06 | 1.9073e-06 | 0.0 | 1.11
Output | 9.7036e-05 | 9.7036e-05 | 9.7036e-05 | 0.0 | 56.37
Modify | 0 | 0 | 0 | 0.0 | 0.00
Other | | 6.199e-06 | | | 3.60
Nlocal: 3 ave 3 max 3 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Nghost: 78 ave 78 max 78 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Neighs: 7 ave 7 max 7 min
Histogram: 1 0 0 0 0 0 0 0 0 0
FullNghs: 14 ave 14 max 14 min
Histogram: 1 0 0 0 0 0 0 0 0 0
Total # of neighbors = 14
Ave neighs/atom = 4.66667
Neighbor list builds = 0
Dangerous builds = 0
write_data data.msmeam
print "All done!"
All done!
Total wall time: 0:00:00

View File

@ -0,0 +1,24 @@
ITEM: TIMESTEP
0
ITEM: NUMBER OF ATOMS
3
ITEM: BOX BOUNDS pp pp pp
-4 4
-4 4
-4 4
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0
ITEM: TIMESTEP
1
ITEM: NUMBER OF ATOMS
3
ITEM: BOX BOUNDS pp pp pp
-4 4
-4 4
-4 4
ITEM: ATOMS id x y z fx fy fz c_pot_energy c_stress[1] c_stress[2] c_stress[3] c_stress[4] c_stress[5] c_stress[6]
1 0 0 0 -131.925 -88.3005 0 22.9153 -2.147e+08 -1.62661e+08 -0 -2.05301e+07 -0 -0
2 2.2 0 0 120.809 -0.482171 0 14.7692 -2.12028e+08 -0 -0 403352 -0 -0
3 0.3 2.3 0 11.1159 88.7827 0 8.61478 -2.67145e+06 -1.62661e+08 -0 -2.09335e+07 -0 -0

View File

@ -0,0 +1,25 @@
# NOTE: This script can be modified for different pair styles
# See in.elastic for more info.
variable Pu string H
print "potential chosen ${Pu}"
# Choose potential
pair_style meam/ms
print "we just executed"
pair_coeff * * library.msmeam ${Pu} Ga4 HGa.meam ${Pu} Ga4
# Setup neighbor style
neighbor 1.0 bin
neigh_modify once no every 1 delay 0 check yes
# Setup minimization style
variable dmax equal 1.0e-2
min_style cg
min_modify dmax ${dmax} line quadratic
compute eng all pe/atom
compute eatoms all reduce sum c_eng
# Setup output
thermo 100
thermo_style custom step temp etotal press pxx pyy pzz pxy pxz pyz lx ly lz vol c_eatoms
thermo_modify norm yes

View File

@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda
endif
gpu_SYSINC =
gpu_SYSLIB = -lcudart -lcuda
gpu_SYSLIB = -lcudart -lcuda -lcufft
gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs

View File

@ -1,8 +1,16 @@
# Common headers for kernels
PRE1_H = lal_preprocessor.h lal_aux_fun1.h
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \
lal_pre_cuda_hip.h
ALL_H = $(NVD_H) $(wildcard ./lal_*.h)
# Headers for Host files
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H)
# Source files
SRCS := $(wildcard ./lal_*.cpp)
@ -54,13 +62,40 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
$(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu
$(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@
# host code compilation
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H)
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H)
$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
#ifdef CUDPP_OPT
@ -77,10 +112,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini
#endif
# build libgpu.a

View File

@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h)
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h
# Headers for Host files
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \
HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \
lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H)
@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.
$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h
$(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h;
$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h
$(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h;
$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;

View File

@ -26,6 +26,9 @@
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif
#ifndef UCL_NO_API_CHECK
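The new UCL_DEBUG_ARG macro keeps a parameter name visible only when UCL_DEBUG is defined, so arguments used solely in assert() checks do not trigger unused-parameter warnings in release builds. A minimal self-contained sketch of the pattern (the function name is illustrative):

#include <cassert>
#include <cstddef>

#ifdef UCL_DEBUG
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif

// 'rows' is named (and checked) only in debug builds; release builds
// see an unnamed parameter and emit no unused-parameter warning.
inline void view_1row(const size_t UCL_DEBUG_ARG(rows)) {
#ifdef UCL_DEBUG
  assert(rows == 1);
#endif
}

The same pattern is applied to the view() and view_offset() methods in ucl_d_vec.h and ucl_h_vec.h later in this changeset.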

View File

@ -33,6 +33,9 @@
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif
#ifndef UCL_NO_API_CHECK

View File

@ -309,15 +309,14 @@ class UCL_Device {
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return 0; }
inline size_t max_pitch(const int) { return 0; }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported() { return sharing_supported(_device); }
/// Returns false if accelerator cannot be shared by multiple processes
/** If it cannot be determined, true is returned **/
inline bool sharing_supported(const int i)
{ return true; }
inline bool sharing_supported(const int) { return true; }
/// True if the device is a sub-device
inline bool is_subdevice()

View File

@ -33,6 +33,9 @@
#ifdef UCL_DEBUG
#define UCL_SYNC_DEBUG
#define UCL_DESTRUCT_CHECK
#define UCL_DEBUG_ARG(arg) arg
#else
#define UCL_DEBUG_ARG(arg)
#endif
#ifndef UCL_NO_API_CHECK

View File

@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){
cl_mem_flags buffer_perm;
cl_map_flags map_perm;
if (kind==UCL_READ_ONLY) {
@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
const size_t /*dst_offset*/, const size_t src_offset) {
if (src.cbegin()==dst.cbegin()) {
#ifdef UCL_DBG_MEM_TRACE
std::cerr << "UCL_COPY 1S\n";
@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
cl_command_queue &cq, const cl_bool block,
const size_t dst_offset, const size_t src_offset) {
const size_t dst_offset, const size_t /*src_offset*/) {
if (src.cbegin()==dst.cbegin()) {
if (block) ucl_sync(cq);
#ifdef UCL_DBG_MEM_TRACE

View File

@ -35,19 +35,19 @@ class UCL_Texture {
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(UCL_Program &prog, const char *texture_name) { }
inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name) { }
inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) { }
inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) { }
inline void allow(UCL_Kernel & /*kernel*/) { }
private:
friend class UCL_Kernel;
@ -62,7 +62,7 @@ class UCL_Const {
inline UCL_Const(UCL_Program &prog, const char *global_name)
{ get_global(prog,global_name); }
/// Set the global reference for this object
inline void get_global(UCL_Program &prog, const char *global_name) {
inline void get_global(UCL_Program &prog, const char * /*global_name*/) {
if (_active) {
CL_DESTRUCT_CALL(clReleaseContext(_context));
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));

View File

@ -71,7 +71,7 @@ class UCL_Timer {
inline void init(UCL_Device &dev) { init(dev,dev.cq()); }
/// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) {
inline void init(UCL_Device & /*dev*/, command_queue &cq) {
clear();
_cq=cq;
clRetainCommandQueue(_cq);

View File

@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> {
// Should never be here
template <int host_t1, int host_t2> struct _host_host_copy {
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) {
assert(0==1);
}
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) {
static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) {
assert(0==1);
}
};
@ -470,24 +469,22 @@ template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
// Neither on host or both on host
template <> struct _ucl_cast_copy<1,1> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
};
@ -495,24 +492,22 @@ template <> struct _ucl_cast_copy<1,1> {
// Neither on host or both on host
template <> struct _ucl_cast_copy<0,0> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/,
mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t /*cols*/, mat3 & /*cast_buffer*/) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/,
const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) {
assert(0==1);
}
};

View File

@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
@ -230,8 +230,8 @@ class UCL_D_Vec : public UCL_BaseMat {
* - The view does not prevent the memory from being freed by the
* allocating container when using CUDA APIs **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
inline void view_offset(const size_t offset,ucl_type &input,
const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif

View File

@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat {
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat {
* allocating container when using CUDA APIs
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols,
UCL_Device &dev) {
#ifdef UCL_DEBUG
assert(rows==1);
@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat {
* allocating container when using CUDA APIs
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
inline void view_offset(const size_t offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows),
const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);

View File

@ -27,7 +27,7 @@ template <int st> struct _ucl_s_obj_help;
// -- Can potentially use same memory if shared by accelerator
template <> struct _ucl_s_obj_help<1> {
template <class t1, class t2, class t3>
static inline int alloc(t1 &host, t2 &device, t3 &_buffer,
static inline int alloc(t1 &host, t2 &device, t3 & /*_buffer*/,
const int cols, UCL_Device &acc,
const enum UCL_MEMOPT kind1,
const enum UCL_MEMOPT kind2) {
@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> {
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) {
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) {
ucl_copy(dst,src,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) {
static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) {
ucl_copy(dst,src,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
const bool async) {
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) {
ucl_copy(dst,src,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer,
command_queue &cq) {
static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) {
ucl_copy(dst,src,cols,cq);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, const bool async) {
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) {
ucl_copy(dst,src,rows,cols,async);
}
template <class t1, class t2, class t3>
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols,
t3 &buffer, command_queue &cq) {
static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) {
ucl_copy(dst,src,rows,cols,cq);
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) {
if (device.kind()==UCL_VIEW) {
device.view(host);
return UCL_SUCCESS;
@ -353,7 +349,7 @@ template <int st> struct _ucl_s_obj_help {
}
template <class t1, class t2, class t3>
static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) {
static inline int dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) {
int err=buff.resize(cols);
if (err!=UCL_SUCCESS)
return err;

lib/gpu/lal_amoeba.cpp (new file, 322 lines)
View File

@ -0,0 +1,322 @@
/***************************************************************************
amoeba.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the amoeba pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#if defined(USE_OPENCL)
#include "amoeba_cl.h"
#elif defined(USE_CUDART)
const char *amoeba=0;
#else
#include "amoeba_cubin.h"
#endif
#include "lal_amoeba.h"
#include <cassert>
namespace LAMMPS_AL {
#define AmoebaT Amoeba<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
AmoebaT::Amoeba() : BaseAmoeba<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
AmoebaT::~Amoeba() {
clear();
}
template <class numtyp, class acctyp>
int AmoebaT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
const double *host_pdamp, const double *host_thole,
const double *host_dirdamp, const int *host_amtype2class,
const double *host_special_hal,
const double * /*host_special_repel*/,
const double * /*host_special_disp*/,
const double *host_special_mpole,
const double * /*host_special_polar_wscale*/,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_csix, const double *host_adisp,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, const double gpu_split, FILE *_screen,
const double polar_dscale, const double polar_uscale) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
cell_size,gpu_split,_screen,amoeba,
"k_amoeba_multipole", "k_amoeba_udirect2b",
"k_amoeba_umutual2b", "k_amoeba_polar",
"k_amoeba_fphi_uind", "k_amoeba_fphi_mpole",
"k_amoeba_short_nbor", "k_amoeba_special15");
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
for (int i = 0; i < max_amtype; i++) {
host_write[i].x = host_pdamp[i];
host_write[i].y = host_thole[i];
host_write[i].z = host_dirdamp[i];
host_write[i].w = host_amtype2class[i];
}
coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(coeff_amtype,host_write,false);
UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
for (int i = 0; i < max_amclass; i++) {
host_write2[i].x = host_csix[i];
host_write2[i].y = host_adisp[i];
host_write2[i].z = (numtyp)0;
host_write2[i].w = (numtyp)0;
}
coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(coeff_amclass,host_write2,false);
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<5; i++) {
dview[i].x=host_special_hal[i];
dview[i].y=host_special_polar_piscale[i];
dview[i].z=host_special_polar_pscale[i];
dview[i].w=host_special_mpole[i];
}
ucl_copy(sp_amoeba,dview,5,false);
_polar_dscale = polar_dscale;
_polar_uscale = polar_uscale;
_allocated=true;
this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes()
+ sp_amoeba.row_bytes() + this->_tep.row_bytes()
+ this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
+ this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
+ this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void AmoebaT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff_amtype.clear();
coeff_amclass.clear();
sp_amoeba.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double AmoebaT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate the multipole real-space term, returning tep
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int AmoebaT::multipole_real(const int eflag, const int vflag) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// Build the short neighbor list for the cutoff off2_mpole,
// at this point mpole is the first kernel in a time step for AMOEBA
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_mpole, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_multipole.set_size(GX,BX);
this->k_multipole.run(&this->atom->x, &this->atom->extra,
&coeff_amtype, &sp_amoeba,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_felec,
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Launch the real-space permanent field kernel
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// Build the short neighbor list for the cutoff _off2_polar, if not done yet
// this is the first kernel in a time step where _off2_polar is used
if (!this->short_nbor_polar_avail) {
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_polar, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->short_nbor_polar_avail = true;
}
this->k_udirect2b.set_size(GX,BX);
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
&_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Launch the real-space induced field kernel, returning field and fieldp
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// Build the short neighbor list if not done yet
if (!this->short_nbor_polar_avail) {
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->dev_short_nbor,
&this->_off2_polar, &ainum, &nbor_pitch,
&this->_threads_per_atom);
this->short_nbor_polar_avail = true;
}
this->k_umutual2b.set_size(GX,BX);
this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
&nbor_pitch, &this->_threads_per_atom, &this->_aewald,
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Launch the polar real-space kernel, returning tep
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int AmoebaT::polar_real(const int eflag, const int vflag) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
/*
const int cus = this->device->gpu->cus();
while (GX < cus && GX > 1) {
BX /= 2;
GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
}
*/
this->time_pair.start();
// Build the short neighbor list if not done yet
if (!this->short_nbor_polar_avail) {
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_polar, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->short_nbor_polar_avail = true;
}
this->k_polar.set_size(GX,BX);
this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_felec,
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
// Signal that short nbor list is not avail for the next time step
// do it here because polar_real() is the last kernel in a time step at this point
this->short_nbor_polar_avail = false;
return GX;
}
template class Amoeba<PRECISION,ACC_PRECISION>;
}

lib/gpu/lal_amoeba.cu (new file, 2099 lines)

File diff suppressed because it is too large.

lib/gpu/lal_amoeba.h (new file, 100 lines)
View File

@ -0,0 +1,100 @@
/***************************************************************************
amoeba.h
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the amoeba pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_AMOEBA_H
#define LAL_AMOEBA_H
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Amoeba : public BaseAmoeba<numtyp, acctyp> {
public:
Amoeba();
~Amoeba();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, const int max_amtype, const int max_amclass,
const double *host_pdamp, const double *host_thole,
const double *host_dirdamp, const int *host_amtype2class,
const double *host_special_mpole,
const double *host_special_hal,
const double *host_special_repel,
const double *host_special_disp,
const double *host_special_polar_wscale,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_csix, const double *host_adisp,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15, const double cell_size,
const double gpu_split, FILE *_screen,
const double polar_dscale, const double polar_uscale);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
/// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
UCL_D_Vec<numtyp4> coeff_amtype;
/// csix = coeff_amclass.x; adisp = coeff_amclass.y;
UCL_D_Vec<numtyp4> coeff_amclass;
/// Special amoeba values [0-4]:
/// sp_amoeba.x = special_hal
/// sp_amoeba.y = special_polar_pscale,
/// sp_amoeba.z = special_polar_piscale
/// sp_amoeba.w = special_mpole
UCL_D_Vec<numtyp4> sp_amoeba;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _polar_dscale, _polar_uscale;
numtyp _qqrd2e;
protected:
bool _allocated;
int multipole_real(const int eflag, const int vflag);
int udirect2b(const int eflag, const int vflag);
int umutual2b(const int eflag, const int vflag);
int polar_real(const int eflag, const int vflag);
};
}
#endif

lib/gpu/lal_amoeba_ext.cpp (new file, 213 lines)
View File

@ -0,0 +1,213 @@
/***************************************************************************
amoeba_ext.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Functions for LAMMPS access to amoeba acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include <iostream>
#include <cassert>
#include <cmath>
#include "lal_amoeba.h"
using namespace std;
using namespace LAMMPS_AL;
static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
const double *host_pdamp, const double *host_thole,
const double *host_dirdamp, const int *host_amtype2class,
const double *host_special_hal,
const double *host_special_repel,
const double *host_special_disp,
const double *host_special_mpole,
const double *host_special_polar_wscale,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_csix, const double *host_adisp,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, int &gpu_mode, FILE *screen,
const double polar_dscale, const double polar_uscale) {
AMOEBAMF.clear();
gpu_mode=AMOEBAMF.device->gpu_mode();
double gpu_split=AMOEBAMF.device->particle_split();
int first_gpu=AMOEBAMF.device->first_device();
int last_gpu=AMOEBAMF.device->last_device();
int world_me=AMOEBAMF.device->world_me();
int gpu_rank=AMOEBAMF.device->gpu_rank();
int procs_per_gpu=AMOEBAMF.device->procs_per_gpu();
AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu);
bool message=false;
if (AMOEBAMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
host_pdamp, host_thole, host_dirdamp,
host_amtype2class, host_special_hal,
host_special_repel, host_special_disp,
host_special_mpole, host_special_polar_wscale,
host_special_polar_piscale, host_special_polar_pscale,
host_csix, host_adisp, nlocal, nall, max_nbors,
maxspecial, maxspecial15, cell_size, gpu_split,
screen, polar_dscale, polar_uscale);
AMOEBAMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
host_pdamp, host_thole, host_dirdamp,
host_amtype2class, host_special_hal,
host_special_repel, host_special_disp,
host_special_mpole, host_special_polar_wscale,
host_special_polar_piscale, host_special_polar_pscale,
host_csix, host_adisp, nlocal, nall, max_nbors,
maxspecial, maxspecial15, cell_size, gpu_split,
screen, polar_dscale, polar_uscale);
AMOEBAMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
AMOEBAMF.estimate_gpu_overhead();
return init_ok;
}
void amoeba_gpu_clear() {
AMOEBAMF.clear();
}
int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double ** /*host_uind*/,
double ** /*host_uinp*/, double * /*host_pval*/,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo, double *prd) {
return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole,
nullptr, nullptr, nullptr, sublo, subhi, tag,
nspecial, special, nspecial15, special15,
eflag_in, vflag_in, eatom, vatom,
host_start, ilist, jnum, cpu_time,
success, host_q, boxlo, prd);
}
void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *host_amtype, int *host_amgroup, double **host_rpole,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int *nspecial15, tagint** special15,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, const double aewald, const double felec, const double off2,
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi,
tag, nspecial, special, nspecial15, special15,
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
}
void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
const double aewald, const double off2, void **fieldp_ptr) {
AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
aewald, off2, fieldp_ptr);
}
void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
const double aewald, const double off2, void **fieldp_ptr) {
AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
aewald, off2, fieldp_ptr);
}
void amoeba_gpu_update_fieldp(void **fieldp_ptr) {
AMOEBAMF.update_fieldp(fieldp_ptr);
}
void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
const double aewald, const double felec, const double off2,
void **tep_ptr) {
AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr,
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out) {
AMOEBAMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid,
nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out);
}
void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
void **host_fdip_phi2, void **host_fdip_sum_phi) {
AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1,
host_fdip_phi2, host_fdip_sum_phi);
}
void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) {
AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi, felec);
}
void amoeba_setup_fft(const int numel, const int element_type) {
AMOEBAMF.setup_fft(numel, element_type);
}
void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) {
AMOEBAMF.compute_fft1d(in, out, numel, mode);
}
double amoeba_gpu_bytes() {
return AMOEBAMF.host_memory_usage();
}
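
amoeba_gpu_init() above uses a staged initialization so that only one process compiles kernels at a time. Distilled to its core, with raw MPI barriers standing in for the library's world/gpu barriers (a sketch under those assumptions, not library code):

#include <mpi.h>

// Sketch of the staged-init pattern: world rank 0 initializes first (warming
// any kernel/compile caches), then the ranks sharing each GPU take turns.
int staged_init(MPI_Comm world, int gpu_rank, int procs_per_gpu,
                int (*do_init)()) {
  int world_me, init_ok = 0;
  MPI_Comm_rank(world, &world_me);
  if (world_me == 0) init_ok = do_init();    // process 0 only
  MPI_Barrier(world);                        // cf. device->world_barrier()
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0) init_ok = do_init();
    MPI_Barrier(world);                      // cf. device->gpu_barrier()
  }
  return init_ok;
}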

View File

@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const {
bytes+=sizeof(numtyp);
if (_vel)
bytes+=4*sizeof(numtyp);
if (_extra_fields>0)
bytes+=_extra_fields*sizeof(numtyp4);
return bytes;
}
@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) {
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=v.device.row_bytes();
}
if (_extra_fields>0) {
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=extra.device.row_bytes();
}
if (_gpu_nbor>0) {
if (_bonds) {
@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) {
template <class numtyp, class acctyp>
bool AtomT::add_fields(const bool charge, const bool rot,
const int gpu_nbor, const bool bonds, const bool vel) {
const int gpu_nbor, const bool bonds, const bool vel,
const int extra_fields) {
bool success=true;
// Ignore host/device transfers?
int gpu_bytes=0;
@ -191,7 +199,17 @@ bool AtomT::add_fields(const bool charge, const bool rot,
}
}
if (bonds && !_bonds) {
if (extra_fields > 0 && _extra_fields==0) {
_extra_fields=extra_fields;
_other=true;
if (_host_view==false) {
success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY,
UCL_READ_ONLY)==UCL_SUCCESS);
gpu_bytes+=extra.device.row_bytes();
}
}
if (bonds && _bonds==false) {
_bonds=true;
if (_bonds && _gpu_nbor>0) {
success=success && (dev_tag.alloc(_max_atoms,*dev,
@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot,
template <class numtyp, class acctyp>
bool AtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) {
UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel,
const int extra_fields) {
clear();
bool success=true;
@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
_q_avail=false;
_quat_avail=false;
_v_avail=false;
_extra_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
_rot=rot;
_vel=vel;
_other=_charge || _rot || _vel;
_extra_fields=extra_fields;
_other=_charge || _rot || _vel || (extra_fields>0);
dev=&devi;
_time_transfer=0;
@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot,
time_q.init(*dev);
time_quat.init(*dev);
time_vel.init(*dev);
time_extra.init(*dev);
time_pos.zero();
time_q.zero();
time_quat.zero();
time_vel.zero();
time_extra.zero();
_time_cast=0.0;
#ifdef GPU_CAST
@ -308,6 +333,8 @@ void AtomT::clear_resize() {
quat.clear();
if (_vel)
v.clear();
if (_extra_fields>0)
extra.clear();
dev_cell_id.clear();
dev_particle_id.clear();
@ -350,6 +377,7 @@ void AtomT::clear() {
time_q.clear();
time_quat.clear();
time_vel.clear();
time_extra.clear();
clear_resize();
#ifdef GPU_CAST
@ -370,12 +398,19 @@ double AtomT::host_memory_usage() const {
atom_bytes+=4;
if (_vel)
atom_bytes+=4;
if (_extra_fields>0)
atom_bytes+=_extra_fields;
return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom<numtyp,acctyp>);
}
#ifdef USE_CUDPP
#define USE_CUDPP_ARG(arg) arg
#else
#define USE_CUDPP_ARG(arg)
#endif
// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void AtomT::sort_neighbor(const int num_atoms) {
void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) {
#ifdef USE_CUDPP
CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(),
(int *)dev_particle_id.begin(),

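The USE_CUDPP_ARG macro introduced above silences unused-parameter warnings when CUDPP is compiled out. The same idiom in a self-contained form (feature and function names are illustrative only):

#include <cstdio>

// The parameter name is emitted only when the code that reads it exists;
// otherwise the parameter stays anonymous and no warning is raised.
#ifdef USE_FEATURE
#define USE_FEATURE_ARG(arg) arg
#else
#define USE_FEATURE_ARG(arg)
#endif

void report(int USE_FEATURE_ARG(n)) {
#ifdef USE_FEATURE
  std::printf("n = %d\n", n);
#endif
}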
View File

@ -76,7 +76,7 @@ class Atom {
* gpu_nbor 2 if binning on host and neighboring on device **/
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
const bool vel=false);
const bool vel=false, const int extra_fields=0);
/// Check if we have enough device storage and realloc if not
/** Returns true if resized with any call during this timestep **/
@ -96,7 +96,7 @@ class Atom {
* gpu_nbor 1 if neighboring will be performed on device
* gpu_nbor 2 if binning on host and neighboring on device **/
bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
const bool bonds, const bool vel=false);
const bool bonds, const bool vel=false, const int extra_fields=0);
/// Returns true if GPU is using charges
bool charge() { return _charge; }
@ -107,6 +107,9 @@ class Atom {
/// Returns true if GPU is using velocities
bool velocity() { return _vel; }
/// Returns true if GPU is using extra fields
bool using_extra() { return (_extra_fields>0); }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
@ -128,6 +131,8 @@ class Atom {
time_quat.add_to_total();
if (_vel)
time_vel.add_to_total();
if (_extra_fields>0)
time_extra.add_to_total();
}
/// Add copy times to timers
@ -139,6 +144,8 @@ class Atom {
time_quat.zero();
if (_vel)
time_vel.zero();
if (_extra_fields>0)
time_extra.zero();
}
/// Return the total time for host/device data transfer
@ -158,6 +165,10 @@ class Atom {
total+=time_vel.total_seconds();
time_vel.zero_total();
}
if (_extra_fields>0) {
total+=time_extra.total_seconds();
time_extra.zero_total();
}
return total+_time_transfer/1000.0;
}
@ -281,7 +292,11 @@ class Atom {
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; }
/// Signal that we need to transfer atom extra data for next kernel call
inline void extra_data_unavail()
{ _extra_avail=false; }
typedef struct { double x,y,z; } vec3d;
typedef struct { numtyp x,y,z,w; } vec4d_t;
@ -312,7 +327,7 @@ class Atom {
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) {
time_pos.start();
if (_x_avail==false) {
#ifdef GPU_CAST
@ -426,7 +441,7 @@ class Atom {
/// Copy velocities and tags to device asynchronously
/** Copies nall() elements **/
inline void add_v_data(double **host_ptr, tagint *host_tag) {
inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
time_vel.start();
if (_v_avail==false) {
#ifdef GPU_CAST
@ -450,6 +465,33 @@ class Atom {
add_v_data(host_ptr,host_tag);
}
// Cast extras to write buffer
template<class cpytyp>
inline void cast_extra_data(cpytyp *host_ptr) {
if (_extra_avail==false) {
double t=MPI_Wtime();
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
#pragma omp parallel for simd schedule(static)
#elif (LAL_USE_OMP_SIMD == 1)
#pragma omp simd
#endif
for (int i=0; i<_nall*_extra_fields; i++)
extra[i]=host_ptr[i];
_time_cast+=MPI_Wtime()-t;
}
}
// Copy extras to device
/** Copies nall()*_extra_fields elements **/
inline void add_extra_data() {
time_extra.start();
if (_extra_avail==false) {
extra.update_device(_nall*_extra_fields,true);
_extra_avail=true;
}
time_extra.stop();
}
/// Add in casting time from additional data (seconds)
inline void add_cast_time(double t) { _time_cast+=t; }
@ -473,6 +515,8 @@ class Atom {
UCL_Vector<numtyp,numtyp> quat;
/// Velocities
UCL_Vector<numtyp,numtyp> v;
/// Extras
UCL_Vector<numtyp4,numtyp4> extra;
#ifdef GPU_CAST
UCL_Vector<numtyp,numtyp> x_cast;
@ -493,7 +537,7 @@ class Atom {
UCL_H_Vec<int> host_particle_id;
/// Device timers
UCL_Timer time_pos, time_q, time_quat, time_vel;
UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra;
/// Geryon device
UCL_Device *dev;
@ -508,11 +552,12 @@ class Atom {
bool _compiled;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;
bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized;
bool alloc(const int nall);
bool _allocated, _rot, _charge, _bonds, _vel, _other;
int _extra_fields;
int _max_atoms, _nall, _gpu_nbor;
bool _host_view;
double _time_cast, _time_transfer;

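The _extra_avail flag added above follows the same cast-then-upload protocol as the other per-atom fields. A distilled, self-contained sketch of that protocol, with the device copy stubbed out (names are illustrative, not the library API):

#include <vector>

// Stale-flag staging: cast() fills the host buffer only while the data is
// marked unavailable, and upload() performs (and records) the device copy
// at most once per timestep.
struct ExtraStaging {
  std::vector<float> host_buf;
  bool avail = false;

  void mark_unavail() { avail = false; }     // cf. extra_data_unavail()

  void cast(const double *src, int n) {      // cf. cast_extra_data()
    if (avail) return;
    host_buf.assign(src, src + n);
  }

  void upload() {                            // cf. add_extra_data()
    if (!avail) {
      // enqueue the async host->device copy of host_buf here
      avail = true;
    }
  }
};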
lib/gpu/lal_base_amoeba.cpp Normal file
View File

@ -0,0 +1,962 @@
/***************************************************************************
base_amoeba.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
pair_program=nullptr;
ucl_device=nullptr;
}
template <class numtyp, class acctyp>
BaseAmoebaT::~BaseAmoeba() {
delete ans;
delete nbor;
k_multipole.clear();
k_udirect2b.clear();
k_umutual2b.clear();
k_fphi_uind.clear();
k_fphi_mpole.clear();
k_polar.clear();
k_special15.clear();
k_short_nbor.clear();
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
if (fft_plan_created) cufftDestroy(plan);
#endif
if (pair_program) delete pair_program;
}
template <class numtyp, class acctyp>
int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const int maxspecial15,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name_multipole,
const char *k_name_udirect2b,
const char *k_name_umutual2b,
const char *k_name_polar,
const char *k_name_fphi_uind,
const char *k_name_fphi_mpole,
const char *k_name_short_nbor,
const char* k_name_special15) {
screen=_screen;
int gpu_nbor=0;
if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
_threads_per_atom=device->threads_per_charge();
bool charge = true;
bool rot = false;
bool vel = false;
_extra_fields = 24; // round up to accommodate quadruples of numtyp values
// rpole 13; uind 3; uinp 3; amtype, amgroup; pval
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4);
if (success!=0)
return success;
if (ucl_device!=device->gpu) _compiled=false;
ucl_device=device->gpu;
atom=&device->atom;
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name_multipole,
k_name_udirect2b, k_name_umutual2b,k_name_polar,
k_name_fphi_uind, k_name_fphi_mpole,
k_name_short_nbor, k_name_special15);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else {
_nbor_data=&(nbor->dev_nbor);
}
bool alloc_packed=false;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
_gpu_host,max_nbors,cell_size,alloc_packed,
_threads_per_atom);
if (success!=0)
return success;
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
_maxspecial=maxspecial;
_maxspecial15=maxspecial15;
// allocate per-atom array tep
int ef_nall=nlocal; //nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_fieldp_size = _max_tep_size;
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_thetai_size = 0;
_nmax = nall;
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
fft_plan_created = false;
#endif
#ifdef ASYNC_DEVICE_COPY
_end_command_queue=ucl_device->num_queues();
ucl_device->push_command_queue();
#endif
return success;
}
template <class numtyp, class acctyp>
void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
void BaseAmoebaT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
time_pair.clear();
hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear();
ans->clear();
_tep.clear();
_fieldp.clear();
_thetai1.clear();
_thetai2.clear();
_thetai3.clear();
_igrid.clear();
_fdip_phi1.clear();
_fdip_phi2.clear();
_fdip_sum_phi.clear();
_cgrid_brick.clear();
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return nullptr;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return 0;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
// add one-five neighbors
if (_maxspecial15>0) {
UCL_H_Vec<int> view_nspecial15;
UCL_H_Vec<tagint> view_special15;
view_nspecial15.view(nspecial15,nall,*ucl_device);
view_special15.view(special15[0],nall*_maxspecial15,*ucl_device);
ucl_copy(dev_nspecial15,view_nspecial15,nall,false);
ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false);
nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall);
add_onefive_neighbors();
}
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return mn;
}
// ---------------------------------------------------------------------------
// Prepare for multiple kernel calls in a time step:
// - reallocate per-atom arrays, if needed
// - transfer extra data from host to device
// - build the full neighbor lists for use by different kernels
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **&ilist, int **&jnum, const double cpu_time,
bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) {
acc_timers();
if (eatom) _eflag=2;
else if (eflag_in) _eflag=1;
else _eflag=0;
if (vatom) _vflag=2;
else if (vflag_in) _vflag=1;
else _vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (_eflag) _eflag=2;
if (_vflag) _vflag=2;
#endif
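// note (hedged sketch of the encoding above): 0 = off, 1 = global
// accumulation only, 2 = per-atom accumulation; LAL_NO_BLOCK_REDUCE
// promotes any nonzero flag to 2 because per-atom tallies are then the
// only reduction path compiled in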
set_kernel(_eflag,_vflag);
// ------------------- Resize 1-5 neighbor arrays ------------------------
if (nall>_nmax) {
_nmax = nall;
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
}
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return nullptr;
}
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, nspecial15, special15,
success);
if (!success)
return nullptr;
atom->cast_q_data(host_q);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_q_data();
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
atom->add_extra_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
// re-allocate dev_short_nbor if necessary
if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) {
int inum_max=static_cast<int>(static_cast<double>(inum_full)*1.10); // avoid shadowing the member _nmax
dev_short_nbor.resize((2+_max_nbors)*inum_max);
}
hd_balancer.stop_timer();
return nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Compute multipole real-space part
// precompute() must already have been invoked to (re)allocate memory
// for now, this is the first stage of a time step computed on the GPU for AMOEBA
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
const int /*nall*/, double ** /*host_x*/,
int * /*host_type*/, int * /*host_amtype*/,
int * /*host_amgroup*/, double ** /*host_rpole*/,
double * /*host_pval*/, double * /*sublo*/,
double * /*subhi*/, tagint * /*tag*/,
int ** /*nspecial*/, tagint ** /*special*/,
int * /*nspecial15*/, tagint ** /*special15*/,
const bool /*eflag_in*/, const bool /*vflag_in*/,
const bool /*eatom*/, const bool /*vatom*/,
int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
const double /*cpu_time*/, bool & /*success*/,
const double aewald, const double felec,
const double off2_mpole, double * /*host_q*/,
double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {
// ------------------- Resize _tep array ------------------------
if (inum_full>_max_tep_size) {
_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
_tep.resize(_max_tep_size*4);
}
*tep_ptr=_tep.host.begin();
_off2_mpole = off2_mpole;
_felec = felec;
_aewald = aewald;
multipole_real(_eflag,_vflag);
// leave the answers (forces, energies and virial) on the device,
// only copy them back in the last kernel (polar_real)
//ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
//device->add_ans_object(ans);
// copy tep from device to host
_tep.update_host(_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute the direct real space part
// of the permanent field
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
const double aewald, const double off2_polar,
void** fieldp_ptr) {
// all the necessary data arrays are already copied from host to device
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
atom->add_extra_data();
*fieldp_ptr=_fieldp.host.begin();
// specify the correct cutoff and alpha values
_off2_polar = off2_polar;
_aewald = aewald;
udirect2b(_eflag,_vflag);
// copy field and fieldp from device to host (_fieldp stores both arrays, one after another)
_fieldp.update_host(_max_fieldp_size*8,false);
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute the direct real space part
// of the induced field
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/,
double **host_uind, double **host_uinp, double * /*host_pval*/,
const double aewald, const double off2_polar,
void** /*fieldp_ptr*/) {
// only copy the necessary data arrays that are updated over the iterations
// use nullptr for the other arrays that are already copied from host to device
cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr);
atom->add_extra_data();
// set the correct cutoff and alpha
_off2_polar = off2_polar;
_aewald = aewald;
// launch the kernel
umutual2b(_eflag,_vflag);
// copy field and fieldp from device to host (_fieldp stores both arrays, one after another)
// NOTE: move this step to update_fieldp() to delay device-host transfer
// after umutual1 and self are done on the GPU
// *fieldp_ptr=_fieldp.host.begin();
// _fieldp.update_host(_max_fieldp_size*8,false);
}
// ---------------------------------------------------------------------------
// Prepare for umutual1() after bspline_fill() is done on host
// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed
//   host_thetai1, host_thetai2, host_thetai3 are allocated as nmax by bsordermax by 4
//   host_igrid is allocated as nmax by 4
// - transfer extra data from host to device
// NOTE: can be re-used for fphi_mpole() but with a different bsorder value
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** host_igrid,
const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out) {
// update bsorder with that of the kspace solver
_bsorder = bsorder;
// allocate or resize per-atom arrays
// _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
// will be consolidated once all terms are ready
if (_max_thetai_size == 0) {
_max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
_thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
_thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
_thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
_igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
_fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
_fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
_fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE);
} else {
if ((int)_thetai1.cols()<_max_thetai_size*bsorder) {
_max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
_thetai1.resize(_max_thetai_size*bsorder);
_thetai2.resize(_max_thetai_size*bsorder);
_thetai3.resize(_max_thetai_size*bsorder);
_igrid.resize(_max_thetai_size*4);
_fdip_phi1.resize(_max_thetai_size*10);
_fdip_phi2.resize(_max_thetai_size*10);
_fdip_sum_phi.resize(_max_thetai_size*20);
}
}
#ifdef ASYNC_DEVICE_COPY
_thetai1.cq(ucl_device->cq(_end_command_queue));
_thetai2.cq(ucl_device->cq(_end_command_queue));
_thetai3.cq(ucl_device->cq(_end_command_queue));
#endif
// pack host data to device
for (int i = 0; i < inum_full; i++)
for (int j = 0; j < bsorder; j++) {
int idx = i*bsorder + j;
numtyp4 v;
v.x = host_thetai1[i][j][0];
v.y = host_thetai1[i][j][1];
v.z = host_thetai1[i][j][2];
v.w = host_thetai1[i][j][3];
_thetai1[idx] = v;
}
_thetai1.update_device(true);
for (int i = 0; i < inum_full; i++)
for (int j = 0; j < bsorder; j++) {
int idx = i*bsorder + j;
numtyp4 v;
v.x = host_thetai2[i][j][0];
v.y = host_thetai2[i][j][1];
v.z = host_thetai2[i][j][2];
v.w = host_thetai2[i][j][3];
_thetai2[idx] = v;
}
_thetai2.update_device(true);
for (int i = 0; i < inum_full; i++)
for (int j = 0; j < bsorder; j++) {
int idx = i*bsorder + j;
numtyp4 v;
v.x = host_thetai3[i][j][0];
v.y = host_thetai3[i][j][1];
v.z = host_thetai3[i][j][2];
v.w = host_thetai3[i][j][3];
_thetai3[idx] = v;
}
_thetai3.update_device(true);
for (int i = 0; i < inum_full; i++) {
int idx = i*4;
_igrid[idx+0] = host_igrid[i][0];
_igrid[idx+1] = host_igrid[i][1];
_igrid[idx+2] = host_igrid[i][2];
}
_igrid.update_device(true);
// _cgrid_brick holds the grid-based potential
_nzlo_out = nzlo_out;
_nzhi_out = nzhi_out;
_nylo_out = nylo_out;
_nyhi_out = nyhi_out;
_nxlo_out = nxlo_out;
_nxhi_out = nxhi_out;
_ngridz = nzhi_out - nzlo_out + 1;
_ngridy = nyhi_out - nylo_out + 1;
_ngridx = nxhi_out - nxlo_out + 1;
_num_grid_points = _ngridx * _ngridy * _ngridz;
int numel = _num_grid_points;
if (_cgrid_brick.cols() == 0) {
int nsize=(int)(((double)numel)*1.1);
_cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
} else if (numel > (int)_cgrid_brick.cols()) {
_cgrid_brick.resize(numel);
}
}
// ---------------------------------------------------------------------------
// fphi_uind = induced potential from grid
// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
// NOTE: host_grid_brick is from ic_kspace post_convolution()
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick,
void **host_fdip_phi1,
void **host_fdip_phi2,
void **host_fdip_sum_phi)
{
int n = 0;
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
numtyp2 v;
v.x = host_grid_brick[iz][iy][ix][0];
v.y = host_grid_brick[iz][iy][ix][1];
_cgrid_brick[n] = v;
n++;
}
_cgrid_brick.update_device(_num_grid_points, false);
#ifdef ASYNC_DEVICE_COPY
ucl_device->sync();
#endif
// launch the kernel with its execution configuration (see below)
fphi_uind();
// copy data from device to host
_fdip_phi1.update_host(_max_thetai_size*10, false);
_fdip_phi2.update_host(_max_thetai_size*10, false);
_fdip_sum_phi.update_host(_max_thetai_size*20, false);
// return the pointers to the host-side arrays
*host_fdip_phi1 = _fdip_phi1.host.begin();
*host_fdip_phi2 = _fdip_phi2.host.begin();
*host_fdip_sum_phi = _fdip_sum_phi.host.begin();
}
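// note (hedged): the staging loop above linearizes the brick x-fastest, so
// a grid point (ix,iy,iz) lands at flat index
//   (iz-_nzlo_out)*ngridxy + (iy-_nylo_out)*_ngridx + (ix-_nxlo_out),
// with ngridxy = _ngridx*_ngridy; the k_fphi_* kernels receive ngridxy and
// _ngridx so they can invert this mapping on the device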
// ---------------------------------------------------------------------------
// Interpolate the potential from the PME grid
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::fphi_uind() {
int ainum=ans->inum();
if (ainum == 0)
return 0;
// Compute the block size and grid size to keep all cores busy
const int BX=block_size();
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
time_pair.start();
int ngridxy = _ngridx * _ngridy;
k_fphi_uind.set_size(GX,BX);
k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
&_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum,
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// fphi_mpole = multipole potential from grid (limited to polar_kspace for now)
// fphi_mpole extracts the permanent multipole potential from
// the particle mesh Ewald grid
// NOTE: host_grid_brick is from p_kspace post_convolution()
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec)
{
int n = 0;
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
numtyp2 v;
v.x = host_grid_brick[iz][iy][ix];
v.y = (numtyp)0;
_cgrid_brick[n] = v;
n++;
}
_cgrid_brick.update_device(_num_grid_points, false);
_felec = felec;
fphi_mpole();
_fdip_sum_phi.update_host(_max_thetai_size*20, false);
*host_fphi = _fdip_sum_phi.host.begin();
}
// ---------------------------------------------------------------------------
// Interpolate the potential from the PME grid
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::fphi_mpole() {
int ainum=ans->inum();
if (ainum == 0)
return 0;
// Compute the block size and grid size to keep all cores busy
const int BX=block_size();
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
time_pair.start();
int ngridxy = _ngridx * _ngridy;
k_fphi_mpole.set_size(GX,BX);
k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick,
&_fdip_sum_phi, &_bsorder, &ainum, &_felec,
&_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx);
time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind,
double **host_uinp, double *host_pval,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
const double aewald, const double felec,
const double off2_polar, void **tep_ptr) {
// cast necessary data arrays from host to device
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
atom->add_extra_data();
*tep_ptr=_tep.host.begin();
_off2_polar = off2_polar;
_felec = felec;
_aewald = aewald;
const int red_blocks=polar_real(_eflag,_vflag);
// only copy answers (forces, energies and virial) back from the device
// in the last kernel (which is polar_real here)
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
// copy tep from device to host
_tep.update_host(_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Return the memory bytes allocated on the host and device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
double BaseAmoebaT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(BaseAmoeba<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Setup the FFT plan: only placeholder for now
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/)
{
// TODO: set up the FFT plan based on the backend (cuFFT or hipFFT)
}
// ---------------------------------------------------------------------------
// Compute FFT on the device: only placeholder for now
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/,
const int /*numel*/, const int /*mode*/)
{
// TODO: set up the FFT plan and execute the transform based on the backend (cuFFT or hipFFT)
#if 0 // !defined(USE_OPENCL) && !defined(USE_HIP)
if (fft_plan_created == false) {
int m = numel/2;
cufftPlan1d(&plan, m, CUFFT_Z2Z, 1);
fft_plan_created = true;
}
// n = number of double complex
int n = numel/2;
// copy the host array to the device (data)
UCL_Vector<cufftDoubleComplex,cufftDoubleComplex> data;
data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE);
int m = 0;
double* d_in = (double*)in;
for (int i = 0; i < n; i++) {
data[i].x = d_in[m];
data[i].y = d_in[m+1];
m += 2;
}
data.update_device(false);
// perform the in-place forward FFT
cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device,
(cufftDoubleComplex*)&data.device, CUFFT_FORWARD);
if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result);
ucl_device->sync();
data.update_host(false);
// copy back the data to the host array
m = 0;
double* d_out = (double*)out;
for (int i = 0; i < n; i++) {
d_out[m] = data[i].x;
d_out[m+1] = data[i].y;
m += 2;
}
data.clear();
#endif
}
// ---------------------------------------------------------------------------
// Copy the extra data from host to device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
double** uind, double** uinp, double* pval) {
// signal that we need to transfer extra data from the host
atom->extra_data_unavail();
int _nall=atom->nall();
numtyp4 *pextra=reinterpret_cast<numtyp4*>(&(atom->extra[0]));
int n = 0;
int nstride = 1; //4;
if (rpole) {
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx].x = rpole[i][0];
pextra[idx].y = rpole[i][1];
pextra[idx].z = rpole[i][2];
pextra[idx].w = rpole[i][3];
}
n += nstride*_nall;
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx].x = rpole[i][4];
pextra[idx].y = rpole[i][5];
pextra[idx].z = rpole[i][6];
pextra[idx].w = rpole[i][8];
}
n += nstride*_nall;
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx].x = rpole[i][9];
pextra[idx].y = rpole[i][12];
pextra[idx].z = (numtyp)amtype[i];
pextra[idx].w = (numtyp)amgroup[i];
}
} else {
n += 2*nstride*_nall;
}
n += nstride*_nall;
if (uind) {
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx].x = uind[i][0];
pextra[idx].y = uind[i][1];
pextra[idx].z = uind[i][2];
pextra[idx].w = 0;
}
}
n += nstride*_nall;
if (uinp) {
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx].x = uinp[i][0];
pextra[idx].y = uinp[i][1];
pextra[idx].z = uinp[i][2];
pextra[idx].w = 0;
}
}
n += nstride*_nall;
if (pval) {
for (int i = 0; i < _nall; i++) {
int idx = n+i*nstride;
pextra[idx].x = pval[i];
pextra[idx].y = 0;
pextra[idx].z = 0;
pextra[idx].w = 0;
}
}
}
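// note (hedged): the layout produced above is six nall-sized blocks of
// numtyp4 (24 numtyp values per atom, matching _extra_fields):
//   block 0: rpole[0..3]                   charge + dipole
//   block 1: rpole[4,5,6,8]                qxx, qxy, qxz, qyy
//   block 2: rpole[9,12], amtype, amgroup  qyz, qzz + type/group
//   block 3: uind[0..2]   block 4: uinp[0..2]   block 5: pval
// only six quadrupole components are stored since the tensor is symmetric;
// the slot for block b, atom i is extra[b*nall + i]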
// ---------------------------------------------------------------------------
// Compile (load) the kernel strings and set the kernels
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname_multipole,
const char *kname_udirect2b,
const char *kname_umutual2b,
const char *kname_polar,
const char *kname_fphi_uind,
const char *kname_fphi_mpole,
const char *kname_short_nbor,
const char* kname_special15) {
if (_compiled)
return;
if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev);
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen);
k_multipole.set_function(*pair_program, kname_multipole);
k_udirect2b.set_function(*pair_program, kname_udirect2b);
k_umutual2b.set_function(*pair_program, kname_umutual2b);
k_polar.set_function(*pair_program, kname_polar);
k_fphi_uind.set_function(*pair_program, kname_fphi_uind);
k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole);
k_short_nbor.set_function(*pair_program, kname_short_nbor);
k_special15.set_function(*pair_program, kname_special15);
pos_tex.get_texture(*pair_program, "pos_tex");
q_tex.get_texture(*pair_program, "q_tex");
_compiled=true;
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.has_subgroup_support()) {
int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size);
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
}
// ---------------------------------------------------------------------------
// Specify 1-5 neighbors from the current neighbor list
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int BaseAmoebaT::add_onefive_neighbors() {
// Compute the block size and grid size to keep all cores busy
const int BX=block_size();
int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/
(BX/_threads_per_atom)));
int _nall=atom->nall();
int ainum=ans->inum();
int nbor_pitch=nbor->nbor_pitch();
k_special15.set_size(GX,BX);
k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(),
&atom->dev_tag, &dev_nspecial15, &dev_special15,
&ainum, &_nall, &nbor_pitch,
&_threads_per_atom);
return GX;
}
template class BaseAmoeba<PRECISION,ACC_PRECISION>;
}

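Read together, the compute_* entry points above imply the following per-timestep call order, with device-to-host transfers deferred to update_fieldp() and polar_real(). This is a sketch inferred from the comments in this file, not an authoritative driver:

// precompute(...);                  // upload x/q/extra, (re)build neighbors
// compute_multipole_real(...);      // first GPU stage; tep copied back
// compute_udirect2b(...);           // permanent field; field/fieldp copied
// for (each induce iteration)
//   compute_umutual2b(...);         // induced field; transfer deferred ...
// update_fieldp(...);               // ... until here
// precompute_kspace(...);           // stage thetai/igrid for the PME grid
// compute_fphi_uind(...) / compute_fphi_mpole(...);  // grid interpolation
// compute_polar_real(...);          // last kernel; answers copied back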
lib/gpu/lal_base_amoeba.h Normal file
View File

@ -0,0 +1,325 @@
/***************************************************************************
base_amoeba.h
-------------------
Trung Dac Nguyen (Northwestern)
Base class for pair styles needing per-particle data for position,
charge, and type.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_BASE_AMOEBA_H
#define LAL_BASE_AMOEBA_H
#include "lal_device.h"
#include "lal_balance.h"
#include "mpi.h"
#if defined(USE_OPENCL)
#include "geryon/ocl_texture.h"
#elif defined(USE_CUDART)
#include "geryon/nvc_texture.h"
#elif defined(USE_HIP)
#include "geryon/hip_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
//#define ASYNC_DEVICE_COPY
#if !defined(USE_OPENCL) && !defined(USE_HIP)
// temporary workaround for int2 also defined in cufft
#ifdef int2
#undef int2
#endif
#include "cufft.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class BaseAmoeba {
public:
BaseAmoeba();
virtual ~BaseAmoeba();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \param k_name name for the kernel for force calculation
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15, const double cell_size,
const double gpu_split, FILE *screen, const void *pair_program,
const char *kname_multipole, const char *kname_udirect2b,
const char *kname_umutual2b, const char *kname_polar,
const char *kname_fphi_uind, const char *kname_fphi_mpole,
const char *kname_short_nbor, const char* kname_special15);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(const int add_kernels=0);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->x,4);
q_tex.bind_float(atom->q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number of local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
}
/// Zero timers
inline void zero_timers() {
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
int build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int *nspecial15, tagint **special15,
bool &success);
/// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed
virtual int** precompute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double **host_uind,
double **host_uinp, double *host_pval, double *sublo, double *subhi,
tagint *tag, int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **&ilist, int **&numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd);
/// Compute multipole real-space with device neighboring
virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double *host_pval,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, int *nspecial15, tagint **special15,
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
int &host_start, int **ilist, int **numj, const double cpu_time,
bool &success, const double aewald, const double felec,
const double off2_mpole, double *charge, double *boxlo,
double *prd, void **tep_ptr);
/// Compute the real space part of the permanent field (udirect2b) with device neighboring
virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
const double aewald, const double off2_polar, void **fieldp_ptr);
/// Compute the real space part of the induced field (umutual2b) with device neighboring
virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
const double aewald, const double off2_polar, void **fieldp_ptr);
/// Allocate/resize per-atom arrays before the kspace parts in induce() and polar
virtual void precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out);
/// Interpolate the induced potential from the grid
virtual void compute_fphi_uind(double ****host_grid_brick,
void **host_fdip_phi1, void **host_fdip_phi2,
void **host_fdip_sum_phi);
/// Interpolate the multipolar potential from the grid
virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi,
const double felec);
/// Compute polar real-space with device neighboring
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
const double aewald, const double felec, const double off2_polar,
void **tep_ptr);
// copy field and fieldp from device to host after umutual2b
virtual void update_fieldp(void **fieldp_ptr) {
*fieldp_ptr=_fieldp.host.begin();
// _fieldp stores both arrays, one after another
_fieldp.update_host(_max_fieldp_size*8,false);
}
/// Set up an FFT plan, where size is the number of elements
void setup_fft(const int size, const int element_type=0);
/// Compute a forward/backward FFT on the device
void compute_fft1d(void* in, void* out, const int numel, const int mode);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
Device<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
Balance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
Atom<numtyp,acctyp> *atom;
UCL_Vector<numtyp,numtyp> polar1, polar2, polar3, polar4, polar5;
/// Cast host arrays into a single array for atom->extra
void cast_extra_data(int* amtype, int* amgroup, double** rpole,
double** uind, double** uinp, double* pval=nullptr);
/// Per-atom arrays
UCL_Vector<acctyp,acctyp> _tep, _fieldp;
int _nmax, _max_tep_size, _max_fieldp_size;
int _bsorder;
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
UCL_Vector<int,int> _igrid;
UCL_Vector<numtyp2,numtyp2> _cgrid_brick;
UCL_Vector<acctyp,acctyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
int _max_thetai_size;
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
int _ngridx, _ngridy, _ngridz, _num_grid_points;
int _end_command_queue;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
Neighbor *nbor;
/// Device storage for 1-5 special neighbor counts
UCL_D_Vec<int> dev_nspecial15;
/// Device storage for special neighbors
UCL_D_Vec<tagint> dev_special15, dev_special15_t;
int add_onefive_neighbors();
UCL_D_Vec<int> dev_short_nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
UCL_Kernel k_fphi_uind, k_fphi_mpole;
UCL_Kernel k_special15, k_short_nbor;
inline int block_size() { return _block_size; }
inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {}
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
int _extra_fields;
double _max_bytes, _max_an_bytes;
int _maxspecial, _maxspecial15, _max_nbors;
double _gpu_overhead, _driver_overhead;
bool short_nbor_polar_avail;
UCL_D_Vec<int> *_nbor_data;
numtyp _aewald,_felec;
numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar;
int _eflag, _vflag;
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *kname_multipole, const char *kname_udirect2b,
const char *kname_umutual2b, const char *kname_polar,
const char *kname_fphi_uind, const char *kname_fphi_mpole,
const char *kname_short_nbor, const char* kname_special15);
virtual int multipole_real(const int eflag, const int vflag) = 0;
virtual int udirect2b(const int eflag, const int vflag) = 0;
virtual int umutual2b(const int eflag, const int vflag) = 0;
virtual int fphi_uind();
virtual int fphi_mpole();
virtual int polar_real(const int eflag, const int vflag) = 0;
#if !defined(USE_OPENCL) && !defined(USE_HIP)
cufftHandle plan;
#endif
bool fft_plan_created;
};
}
#endif

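Allocations throughout the implementation grow with 10% headroom (the recurring *1.10 idiom, e.g. for _max_tep_size and _max_fieldp_size). As a hypothetical helper equivalent to the arithmetic inlined at each call site:

// Hypothetical helper, not part of the library: grow a size with the same
// 10% headroom used when per-atom arrays are resized.
inline int grow_with_headroom(const int n) {
  return static_cast<int>(static_cast<double>(n) * 1.10);
}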
View File

@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
bool charge = false;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
bool charge = true;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_charge();
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
bool charge = true;
bool rot = true;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
bool charge = false;
bool rot = false;
bool vel = true;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel);
if (success!=0)
return success;
@ -193,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall,
const double cpu_time, bool &success, tagint *tag,
double **host_v, const double dtinvsqrt,
const int seed, const int timestep,
const int nlocal, double *boxlo, double *prd) {
const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
@ -258,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
const double cpu_time, bool &success,
double **host_v, const double dtinvsqrt,
const int seed, const int timestep,
double *boxlo, double *prd) {
double * /*boxlo*/, double * /*prd*/) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;

View File

@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
else
_threads_per_atom=device->threads_per_three();
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
bool charge = false;
bool rot = false;
int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
if (success!=0)
return success;

View File

@ -44,18 +44,14 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
int CHARMMLongT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double ** /*host_offset*/, double *host_special_lj, const int nlocal,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, FILE *_screen,
double host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double *host_special_coul, const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,

View File

@ -52,7 +52,7 @@ DeviceT::~Device() {
}
template <class numtyp, class acctyp>
int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu,
const int first_gpu_id, const int gpu_mode,
const double p_split, const int t_per_atom,
const double user_cell_size, char *ocl_args,
@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
}
_ocl_compile_string="-cl-mad-enable ";
#ifdef CL_VERSION_2_0
_ocl_compile_string+="-cl-std=CL2.0 ";
#endif
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
std::string(OCL_PRECISION_COMPILE);
@ -438,7 +441,7 @@ template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int nall, const int maxspecial,
const bool vel) {
const bool vel, const int extra_fields) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && !gpu->double_precision())
@ -467,7 +470,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel))
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields))
return -3;
_data_in_estimate++;
@ -477,6 +480,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_data_in_estimate++;
if (vel)
_data_in_estimate++;
if (extra_fields>0)
_data_in_estimate++;
} else {
if (!atom.charge() && charge)
_data_in_estimate++;
@ -484,7 +490,9 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
_data_in_estimate++;
if (!atom.velocity() && vel)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel))
if (atom.using_extra() && extra_fields>0)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields))
return -3;
}
@ -520,7 +528,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
template <class numtyp, class acctyp>
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
const int host_nlocal, const int nall,
const int host_nlocal, const int /*nall*/,
const int maxspecial, const int gpu_host,
const int max_nbors, const double cutoff,
const bool pre_cut, const int threads_per_atom,

View File

@ -61,6 +61,7 @@ class Device {
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum number of special bonded atoms per atom
* \param vel True if velocities need to be stored
* \param extra_fields Nonzero if extra fields need to be stored
*
* Returns:
* - 0 if successful
@ -70,7 +71,7 @@ class Device {
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
const int nlocal, const int nall, const int maxspecial,
const bool vel=false);
const bool vel=false, const int extra_fields=0);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for

View File

@ -30,7 +30,7 @@ static DPD<PRECISION,ACC_PRECISION> DPDTMF;
int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0,
double **host_gamma, double **host_sigma, double **host_cut,
double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const int nall, const int /*max_nbors*/, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
DPDTMF.clear();
gpu_mode=DPDTMF.device->gpu_mode();

View File

@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
const bool /*eatom*/, const bool /*vatom*/,
int &host_start, const double cpu_time,
bool &success, void **fp_ptr) {
this->acc_timers();
@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag_in,
const bool vflag_in, const bool eatom,
const bool vatom, int &host_start, int **ilist, int **jnum,
const bool vflag_in, const bool /*eatom*/,
const bool /*vatom*/, int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success, int &inum,
void **fp_ptr) {
this->acc_timers();

641
lib/gpu/lal_hippo.cpp Normal file
View File

@ -0,0 +1,641 @@
/***************************************************************************
hippo.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the hippo pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#if defined(USE_OPENCL)
#include "hippo_cl.h"
#elif defined(USE_CUDART)
const char *hippo=0;
#else
#include "hippo_cubin.h"
#endif
#include "lal_hippo.h"
#include <cassert>
namespace LAMMPS_AL {
#define HippoT Hippo<numtyp, acctyp>
extern Device<PRECISION,ACC_PRECISION> device;
template <class numtyp, class acctyp>
HippoT::Hippo() : BaseAmoeba<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
HippoT::~Hippo() {
clear();
k_repulsion.clear();
k_dispersion.clear();
}
template <class numtyp, class acctyp>
int HippoT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
const double *host_pdamp, const double *host_thole,
const double *host_dirdamp, const int *host_amtype2class,
const double *host_special_repel, const double *host_special_disp,
const double *host_special_mpole,
const double *host_special_polar_wscale,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
const double *host_csix, const double *host_adisp,
const double *host_pcore, const double *host_palpha,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, const double gpu_split, FILE *_screen,
const double polar_dscale, const double polar_uscale) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
cell_size,gpu_split,_screen,hippo,
"k_hippo_multipole", "k_hippo_udirect2b",
"k_hippo_umutual2b", "k_hippo_polar",
"k_hippo_fphi_uind", "k_hippo_fphi_mpole",
"k_hippo_short_nbor", "k_hippo_special15");
if (success!=0)
return success;
// specific to HIPPO
k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion");
k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion");
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
for (int i = 0; i < max_amtype; i++) {
host_write[i].x = host_pdamp[i];
host_write[i].y = host_thole[i];
host_write[i].z = host_dirdamp[i];
host_write[i].w = host_amtype2class[i];
}
coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(coeff_amtype,host_write,false);
for (int i = 0; i < max_amtype; i++) {
host_write[i].x = host_sizpr[i];
host_write[i].y = host_dmppr[i];
host_write[i].z = host_elepr[i];
host_write[i].w = (numtyp)0;
}
coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(coeff_rep,host_write,false);
UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
for (int i = 0; i < max_amclass; i++) {
host_write2[i].x = host_csix[i];
host_write2[i].y = host_adisp[i];
host_write2[i].z = host_pcore[i];
host_write2[i].w = host_palpha[i];
}
coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
ucl_copy(coeff_amclass,host_write2,false);
UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<5; i++) {
dview[i].x=host_special_polar_wscale[i];
dview[i].y=host_special_polar_piscale[i];
dview[i].z=host_special_polar_pscale[i];
dview[i].w=host_special_mpole[i];
}
ucl_copy(sp_polar,dview,5,false);
sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<5; i++) {
dview[i].x=host_special_repel[i];
dview[i].y=host_special_disp[i];
dview[i].z=(numtyp)0;
dview[i].w=(numtyp)0;
}
ucl_copy(sp_nonpolar,dview,5,false);
_polar_dscale = polar_dscale;
_polar_uscale = polar_uscale;
_allocated=true;
this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes()
+ coeff_amclass.row_bytes() + sp_polar.row_bytes()
+ sp_nonpolar.row_bytes() + this->_tep.row_bytes()
+ this->_fieldp.row_bytes() + this->_thetai1.row_bytes()
+ this->_thetai2.row_bytes() + this->_thetai3.row_bytes()
+ this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes();
return 0;
}
template <class numtyp, class acctyp>
void HippoT::clear() {
if (!_allocated)
return;
_allocated=false;
coeff_amtype.clear();
coeff_rep.clear();
coeff_amclass.clear();
sp_polar.clear();
sp_nonpolar.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double HippoT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(Hippo<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Compute the repulsion term, returning tep
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_repulsion(const int /*ago*/, const int inum_full,
const int /*nall*/, double ** /*host_x*/,
int * /*host_type*/, int * /*host_amtype*/,
int * /*host_amgroup*/, double ** /*host_rpole*/,
double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/,
int ** /*nspecial*/, tagint ** /*special*/,
int * /*nspecial15*/, tagint ** /*special15*/,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
const double /*cpu_time*/, bool & /*success*/,
const double /*aewald*/, const double off2_repulse,
double * /*host_q*/, double * /*boxlo*/, double * /*prd*/,
double cut2, double c0, double c1, double c2,
double c3, double c4, double c5, void **tep_ptr) {
this->acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
this->set_kernel(eflag,vflag);
// ------------------- Resize _tep array ------------------------
if (inum_full>this->_max_tep_size) {
this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
this->_tep.resize(this->_max_tep_size*4);
}
*tep_ptr=this->_tep.host.begin();
this->_off2_repulse = off2_repulse;
_cut2 = cut2;
_c0 = c0;
_c1 = c1;
_c2 = c2;
_c3 = c3;
_c4 = c4;
_c5 = c5;
repulsion(this->_eflag,this->_vflag);
// copy tep from device to host
this->_tep.update_host(this->_max_tep_size*4,false);
}
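// A note on the eflag/vflag encoding computed above (a reading of the
// eatom/vatom branches, not a documented contract): 0 skips the term,
// 1 accumulates globally, 2 accumulates per atom. Equivalent sketch:
//
//   inline int answer_flag(bool requested, bool per_atom) {
//     if (per_atom) return 2;   // per-atom energy/virial wanted
//     if (requested) return 1;  // global accumulation only
//     return 0;                 // term not needed this step
//   }
//
// With LAL_NO_BLOCK_REDUCE defined, nonzero flags are promoted to 2
// since the block-level reduction path is not available.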
// ---------------------------------------------------------------------------
// Launch the repulsion kernel
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int HippoT::repulsion(const int eflag, const int vflag) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
  // Build the short neighbor list for the cutoff off2_repulse;
  // at this point repulsion is the first kernel in a time step for HIPPO
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_repulse, &ainum,
&nbor_pitch, &this->_threads_per_atom);
k_repulsion.set_size(GX,BX);
k_repulsion.run(&this->atom->x, &this->atom->extra,
&coeff_rep, &sp_nonpolar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald,
&this->_off2_repulse, &_cut2,
&_c0, &_c1, &_c2, &_c3, &_c4, &_c5);
this->time_pair.stop();
return GX;
}
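// The launch geometry above recurs in every kernel of this class: a block
// of BX threads covers BX/_threads_per_atom atoms, so the grid needs
// ceil(inum / (BX/_threads_per_atom)) blocks. An equivalent integer-only
// sketch (illustrative, not part of the library API):
//
//   inline int grid_size(int inum, int bx, int tpa) {
//     const int atoms_per_block = bx / tpa;
//     return (inum + atoms_per_block - 1) / atoms_per_block;
//   }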
// ---------------------------------------------------------------------------
// Compute dispersion real-space
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup,
double **host_rpole, const double aewald,
const double off2_disp) {
// cast necessary data arrays from host to device
this->cast_extra_data(host_amtype, host_amgroup, host_rpole,
nullptr, nullptr, nullptr);
this->atom->add_extra_data();
this->_off2_disp = off2_disp;
this->_aewald = aewald;
dispersion_real(this->_eflag,this->_vflag);
  // only copy the answers back if this is the last kernel;
  // these two lines are left commented out so that the answers
  // (forces, energies and virial) stay on the device until the last kernel
//this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
//this->device->add_ans_object(this->ans);
}
// ---------------------------------------------------------------------------
// Launch the dispersion real-space kernel
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int HippoT::dispersion_real(const int eflag, const int vflag) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
  // Build the short neighbor list for the cutoff off2_disp;
  // the list built by the repulsion kernel used a different cutoff
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_disp, &ainum,
&nbor_pitch, &this->_threads_per_atom);
k_dispersion.set_size(GX,BX);
k_dispersion.run(&this->atom->x, &this->atom->extra,
&coeff_amtype, &coeff_amclass, &sp_nonpolar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald,
&this->_off2_disp);
this->time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Compute the multipole real-space term, returning tep
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full,
const int /*nall*/, double ** /*host_x*/,
int * /*host_type*/, int * /*host_amtype*/,
int * /*host_amgroup*/, double ** /*host_rpole*/,
double* host_pval, double * /*sublo*/,
double * /*subhi*/, tagint * /*tag*/,
int ** /*nspecial*/, tagint ** /*special*/,
int * /*nspecial15*/, tagint ** /*special15*/,
const bool /*eflag_in*/, const bool /*vflag_in*/,
const bool /*eatom*/, const bool /*vatom*/,
int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/,
const double /*cpu_time*/, bool & /*success*/,
const double aewald, const double felec,
const double off2_mpole, double * /*host_q*/,
double * /*boxlo*/, double * /*prd*/, void **tep_ptr) {
// cast necessary data arrays from host to device
this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval);
this->atom->add_extra_data();
// ------------------- Resize _tep array ------------------------
if (inum_full>this->_max_tep_size) {
this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
this->_tep.resize(this->_max_tep_size*4);
}
*tep_ptr=this->_tep.host.begin();
this->_off2_mpole = off2_mpole;
this->_felec = felec;
this->_aewald = aewald;
multipole_real(this->_eflag,this->_vflag);
// copy tep from device to host
this->_tep.update_host(this->_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Launch the multipole real-space kernel
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int HippoT::multipole_real(const int eflag, const int vflag) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// Build the short neighbor list for the cutoff off2_mpole
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_mpole, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->k_multipole.set_size(GX,BX);
this->k_multipole.run(&this->atom->x, &this->atom->extra,
&coeff_amtype, &coeff_amclass, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_felec,
&this->_off2_mpole, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Compute the direct real space part of the permanent field
// returning field and fieldp
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
double **host_uind, double **host_uinp, double* host_pval,
const double aewald, const double off2_polar,
void** fieldp_ptr) {
  // most data arrays are already on the device; cast the updated induced dipoles and pval
this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval);
this->atom->add_extra_data();
*fieldp_ptr=this->_fieldp.host.begin();
this->_off2_polar = off2_polar;
this->_aewald = aewald;
udirect2b(this->_eflag,this->_vflag);
  // copy field and fieldp from device to host (_fieldp stores both arrays, one after the other)
this->_fieldp.update_host(this->_max_fieldp_size*8,false);
}
// ---------------------------------------------------------------------------
// Launch the real-space permanent field kernel
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// Build the short neighbor list for the cutoff _off2_polar, if not done yet
// this is the first kernel in a time step where _off2_polar is used
if (!this->short_nbor_polar_avail) {
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_polar, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->short_nbor_polar_avail = true;
}
this->k_udirect2b.set_size(GX,BX);
this->k_udirect2b.run(&this->atom->x, &this->atom->extra,
&coeff_amtype, &coeff_amclass, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
&_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Compute the direct real space term of the induced field
// returning field and fieldp
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
double **host_uind, double **host_uinp, double * /*host_pval*/,
const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) {
// cast necessary data arrays from host to device
this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
this->atom->add_extra_data();
this->_off2_polar = off2_polar;
this->_aewald = aewald;
umutual2b(this->_eflag,this->_vflag);
  // copy field and fieldp from device to host (_fieldp stores both arrays, one after the other)
  // NOTE: this step is moved to update_fieldp() to delay the device-host transfer:
// *fieldp_ptr=this->_fieldp.host.begin();
// this->_fieldp.update_host(this->_max_fieldp_size*8,false);
}
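// Deferred readback implied by the NOTE above: the host fetches field and
// fieldp later via update_fieldp(), exposed to LAMMPS as
// hippo_gpu_update_fieldp() in lal_hippo_ext.cpp. Sketch of the intent:
//
//   hippo_gpu_compute_umutual2b(...);  // kernel launch only, no readback
//   hippo_gpu_update_fieldp(&fieldp);  // single delayed device->host copy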
// ---------------------------------------------------------------------------
// Launch the real-space induced field kernel
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// Build the short neighbor list if not done yet
if (!this->short_nbor_polar_avail) {
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(), &this->dev_short_nbor,
&this->_off2_polar, &ainum, &nbor_pitch,
&this->_threads_per_atom);
this->short_nbor_polar_avail = true;
}
this->k_umutual2b.set_size(GX,BX);
this->k_umutual2b.run(&this->atom->x, &this->atom->extra,
&coeff_amtype, &coeff_amclass, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
&nbor_pitch, &this->_threads_per_atom, &this->_aewald,
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
return GX;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/,
double **host_uind, double **host_uinp, double * /*host_pval*/,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
const double aewald, const double felec,
const double off2_polar, void **tep_ptr) {
// cast necessary data arrays from host to device
this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr);
this->atom->add_extra_data();
*tep_ptr=this->_tep.host.begin();
this->_off2_polar = off2_polar;
this->_felec = felec;
this->_aewald = aewald;
const int red_blocks=polar_real(this->_eflag,this->_vflag);
// only copy answers (forces, energies and virial) back from the device
// in the last kernel in a timestep (which is polar_real here)
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
this->device->add_ans_object(this->ans);
// copy tep from device to host
this->_tep.update_host(this->_max_tep_size*4,false);
}
// ---------------------------------------------------------------------------
// Launch the polar real-space kernel
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int HippoT::polar_real(const int eflag, const int vflag) {
int ainum=this->ans->inum();
if (ainum == 0)
return 0;
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
/*
const int cus = this->device->gpu->cus();
while (GX < cus && GX > 1) {
BX /= 2;
GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
}
*/
this->time_pair.start();
// Build the short neighbor list if not done yet
if (!this->short_nbor_polar_avail) {
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &this->_off2_polar, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->short_nbor_polar_avail = true;
}
this->k_polar.set_size(GX,BX);
this->k_polar.run(&this->atom->x, &this->atom->extra,
&coeff_amtype, &coeff_amclass, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_aewald, &this->_felec,
&this->_off2_polar, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
// Signal that short nbor list is not avail for the next time step
// do it here because polar_real() is the last kernel in a time step at this point
this->short_nbor_polar_avail = false;
return GX;
}
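// Life cycle of the polar short neighbor list within one time step:
// udirect2b(), umutual2b() and polar_real() all test short_nbor_polar_avail;
// whichever runs first builds dev_short_nbor for _off2_polar and sets the
// flag, later kernels reuse the list, and polar_real() clears the flag
// because it is the last polar kernel of the step.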
template class Hippo<PRECISION,ACC_PRECISION>;
}

2519
lib/gpu/lal_hippo.cu Normal file

File diff suppressed because it is too large

166
lib/gpu/lal_hippo.h Normal file
View File

@ -0,0 +1,166 @@
/***************************************************************************
hippo.h
-------------------
Trung Dac Nguyen (Northwestern)
Class for acceleration of the hippo pair style.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#ifndef LAL_HIPPO_H
#define LAL_HIPPO_H
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
class Hippo : public BaseAmoeba<numtyp, acctyp> {
public:
Hippo();
~Hippo();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, const int max_amtype, const int max_amclass,
const double *host_pdamp, const double *host_thole,
const double *host_dirdamp, const int *host_amtype2class,
           const double *host_special_repel,
           const double *host_special_disp,
           const double *host_special_mpole,
const double *host_special_polar_wscale,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
const double *host_csix, const double *host_adisp,
const double *host_pcore, const double *host_palpha,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15, const double cell_size,
const double gpu_split, FILE *_screen,
const double polar_dscale, const double polar_uscale);
/// Compute repulsion with device neighboring
virtual void compute_repulsion(const int ago, const int inum_full,
const int nall, double **host_x,
int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
const double aewald, const double off2_repulse,
double *host_q, double *boxlo, double *prd,
double cut2, double c0, double c1, double c2,
double c3, double c4, double c5,void** tep_ptr);
/// Compute dispersion real-space with device neighboring
virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup,
double **host_rpole, const double aewald,
const double off2_disp);
/// Compute multipole real-space with device neighboring
virtual void compute_multipole_real(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double *host_pval,
double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
const double aewald, const double felec, const double off2_mpole, double *charge,
double *boxlo, double *prd, void **tep_ptr);
/// Compute the real space part of the permanent field (udirect2b) with device neighboring
virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double* host_pval,
const double aewald, const double off2_polar, void** fieldp_ptr);
/// Compute the real space part of the induced field (umutual2b) with device neighboring
virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
const double aewald, const double off2_polar,
void** fieldp_ptr);
/// Compute polar real-space with device neighboring
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
const double aewald, const double felec, const double off2_polar,
void **tep_ptr);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
/// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
UCL_D_Vec<numtyp4> coeff_amtype;
  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
  /// pcore = coeff_amclass.z; palpha = coeff_amclass.w
UCL_D_Vec<numtyp4> coeff_amclass;
/// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z;
UCL_D_Vec<numtyp4> coeff_rep;
/// Special polar values [0-4]:
/// sp_polar.x = special_polar_wscale
  /// sp_polar.y = special_polar_piscale
  /// sp_polar.z = special_polar_pscale
/// sp_polar.w = special_mpole
UCL_D_Vec<numtyp4> sp_polar;
/// Special nonpolar values [0-4]:
  /// sp_nonpolar.x = special_repel
  /// sp_nonpolar.y = special_disp
UCL_D_Vec<numtyp4> sp_nonpolar;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5;
numtyp _polar_dscale, _polar_uscale;
numtyp _qqrd2e;
UCL_Kernel k_repulsion, k_dispersion;
protected:
bool _allocated;
int repulsion(const int eflag, const int vflag);
int dispersion_real(const int eflag, const int vflag);
int multipole_real(const int eflag, const int vflag);
int udirect2b(const int eflag, const int vflag);
int umutual2b(const int eflag, const int vflag);
int polar_real(const int eflag, const int vflag);
};
}
#endif

231
lib/gpu/lal_hippo_ext.cpp Normal file
View File

@ -0,0 +1,231 @@
/***************************************************************************
hippo_ext.cpp
-------------------
Trung Dac Nguyen (Northwestern)
Functions for LAMMPS access to hippo acceleration routines.
__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________
begin :
email : trung.nguyen@northwestern.edu
***************************************************************************/
#include <iostream>
#include <cassert>
#include <cmath>
#include "lal_hippo.h"
using namespace std;
using namespace LAMMPS_AL;
static Hippo<PRECISION,ACC_PRECISION> HIPPOMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
const double *host_pdamp, const double *host_thole,
const double *host_dirdamp, const int *host_amtype2class,
const double *host_special_repel,
const double *host_special_disp,
const double *host_special_mpole,
const double *host_special_polar_wscale,
const double *host_special_polar_piscale,
const double *host_special_polar_pscale,
const double *host_sizpr, const double *host_dmppr, const double *host_elepr,
const double *host_csix, const double *host_adisp,
const double *host_pcore, const double *host_palpha,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const int maxspecial15,
const double cell_size, int &gpu_mode, FILE *screen,
const double polar_dscale, const double polar_uscale) {
HIPPOMF.clear();
gpu_mode=HIPPOMF.device->gpu_mode();
double gpu_split=HIPPOMF.device->particle_split();
int first_gpu=HIPPOMF.device->first_device();
int last_gpu=HIPPOMF.device->last_device();
int world_me=HIPPOMF.device->world_me();
int gpu_rank=HIPPOMF.device->gpu_rank();
int procs_per_gpu=HIPPOMF.device->procs_per_gpu();
HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu);
bool message=false;
if (HIPPOMF.device->replica_me()==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
int init_ok=0;
if (world_me==0)
init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
host_pdamp, host_thole, host_dirdamp,
host_amtype2class, host_special_repel, host_special_disp,
host_special_mpole, host_special_polar_wscale,
host_special_polar_piscale, host_special_polar_pscale,
host_sizpr, host_dmppr, host_elepr,
host_csix, host_adisp, host_pcore, host_palpha,
nlocal, nall, max_nbors,
maxspecial, maxspecial15, cell_size, gpu_split,
screen, polar_dscale, polar_uscale);
HIPPOMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass,
host_pdamp, host_thole, host_dirdamp,
host_amtype2class, host_special_repel, host_special_disp,
host_special_mpole, host_special_polar_wscale,
host_special_polar_piscale, host_special_polar_pscale,
host_sizpr, host_dmppr, host_elepr,
host_csix, host_adisp, host_pcore, host_palpha,
nlocal, nall, max_nbors,
maxspecial, maxspecial15, cell_size, gpu_split,
screen, polar_dscale, polar_uscale);
HIPPOMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
if (init_ok==0)
HIPPOMF.estimate_gpu_overhead();
return init_ok;
}
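// The function above follows the usual LAMMPS_AL two-phase pattern: world
// rank 0 compiles the kernels first (so compiler output is printed once),
// world_barrier() synchronizes, then every other rank attached to a GPU
// repeats init(), serialized per device by gpu_barrier(). The returned
// init_ok carries the same error codes documented for Hippo::init().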
void hippo_gpu_clear() {
HIPPOMF.clear();
}
int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double ** /*host_uind*/, double ** /*host_uinp*/, double * /*host_pval*/,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo, double *prd) {
return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole,
nullptr, nullptr, nullptr, sublo, subhi, tag,
nspecial, special, nspecial15, special15,
eflag_in, vflag_in, eatom, vatom,
host_start, ilist, jnum, cpu_time,
success, host_q, boxlo, prd);
}
void hippo_gpu_compute_repulsion(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *host_amtype, int *host_amgroup, double **host_rpole,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int *nspecial15, tagint** special15,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, const double aewald, const double off2,
double *host_q, double *boxlo, double *prd,
double cut2, double c0, double c1, double c2,
double c3, double c4, double c5, void **tep_ptr) {
HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole, sublo, subhi,
tag, nspecial, special, nspecial15, special15,
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
cpu_time, success, aewald, off2, host_q, boxlo, prd,
cut2, c0, c1, c2, c3, c4, c5, tep_ptr);
}
void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup,
double **host_rpole, const double aewald,
const double off2) {
HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole,
aewald, off2);
}
void hippo_gpu_compute_multipole_real(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *host_amtype, int *host_amgroup, double **host_rpole,
double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int *nspecial15, tagint** special15,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, const double aewald, const double felec, const double off2,
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi,
tag, nspecial, special, nspecial15, special15,
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
}
void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
const double aewald, const double off2, void **fieldp_ptr) {
HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole,
host_uind, host_uinp, host_pval,
aewald, off2, fieldp_ptr);
}
void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
const double aewald, const double off2, void **fieldp_ptr) {
HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
aewald, off2, fieldp_ptr);
}
void hippo_gpu_update_fieldp(void **fieldp_ptr) {
HIPPOMF.update_fieldp(fieldp_ptr);
}
void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp, double *host_pval,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
const double aewald, const double felec, const double off2,
void **tep_ptr) {
HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder,
double ***host_thetai1, double ***host_thetai2,
double ***host_thetai3, int** igrid,
const int nzlo_out, const int nzhi_out,
const int nylo_out, const int nyhi_out,
const int nxlo_out, const int nxhi_out) {
HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2,
host_thetai3, igrid, nzlo_out, nzhi_out,
nylo_out, nyhi_out, nxlo_out, nxhi_out);
}
void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1,
void **host_fdip_phi2, void **host_fdip_sum_phi) {
HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi);
}
double hippo_gpu_bytes() {
return HIPPOMF.host_memory_usage();
}
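// A sketch of the per-time-step call order these wrappers imply (argument
// lists abbreviated; per the comments in lal_hippo.cpp, repulsion is the
// first HIPPO kernel of a step and polar_real the last):
//
//   hippo_gpu_precompute(...);              // neighboring and data casts
//   hippo_gpu_compute_repulsion(...);       // short list for off2_repulse
//   hippo_gpu_compute_dispersion_real(...); // short list for off2_disp
//   hippo_gpu_compute_multipole_real(...);  // short list for off2_mpole
//   hippo_gpu_compute_udirect2b(...);       // builds the polar short list
//   hippo_gpu_compute_umutual2b(...);       // reuses the polar short list
//   hippo_gpu_update_fieldp(&fieldp);       // deferred field/fieldp copy
//   hippo_gpu_compute_polar_real(...);      // last kernel; answers returned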

431
lib/gpu/lal_hippo_extra.h Normal file
View File

@ -0,0 +1,431 @@
/// **************************************************************************
// hippo_extra.h
// -------------------
// Trung Dac Nguyen
//
// Device code for hippo math routines
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : ndactrung@gmail.com
// ***************************************************************************/
#ifndef LAL_HIPPO_EXTRA_H
#define LAL_HIPPO_EXTRA_H
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_aux_fun1.h"
#else
#endif
#define MY_PI2 (numtyp)1.57079632679489661923
#define MY_PI4 (numtyp)0.78539816339744830962
/* ----------------------------------------------------------------------
damprep generates coefficients for the Pauli repulsion
damping function for powers of the interatomic distance
literature reference:
J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An
Anisotropic, Atomic Multipole Model", Journal of Chemical Physics,
150, 084104 (2019)
------------------------------------------------------------------------- */
ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
const numtyp rr3, const numtyp rr5, const numtyp rr7,
const numtyp rr9, const numtyp rr11, const int rorder,
const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11])
{
numtyp r3,r4;
numtyp r5,r6,r7,r8;
numtyp s,ds,d2s;
numtyp d3s,d4s,d5s;
numtyp dmpi2,dmpk2;
numtyp dmpi22,dmpi23;
numtyp dmpi24,dmpi25;
numtyp dmpi26,dmpi27;
numtyp dmpk22,dmpk23;
numtyp dmpk24,dmpk25;
numtyp dmpk26;
numtyp eps,diff;
numtyp expi,expk;
numtyp dampi,dampk;
numtyp pre,term,tmp;
// compute tolerance value for damping exponents
eps = (numtyp)0.001;
diff = dmpi-dmpk; // fabs(dmpi-dmpk)
if (diff < (numtyp)0) diff = -diff;
// treat the case where alpha damping exponents are equal
if (diff < eps) {
r3 = r2 * r;
r4 = r3 * r;
r5 = r4 * r;
r6 = r5 * r;
r7 = r6 * r;
dmpi2 = (numtyp)0.5 * dmpi;
dampi = dmpi2 * r;
expi = ucl_exp(-dampi);
dmpi22 = dmpi2 * dmpi2;
dmpi23 = dmpi22 * dmpi2;
dmpi24 = dmpi23 * dmpi2;
dmpi25 = dmpi24 * dmpi2;
dmpi26 = dmpi25 * dmpi2;
pre = (numtyp)128.0;
s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi;
ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0;
d2s = dmpi24 * expi * r5 / (numtyp)9.0;
d3s = dmpi25 * expi * r6 / (numtyp)45.0;
d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0;
if (rorder >= 11) {
r8 = r7 * r;
dmpi27 = dmpi2 * dmpi26;
d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0;
}
// treat the case where alpha damping exponents are unequal
} else {
r3 = r2 * r;
r4 = r3 * r;
r5 = r4 * r;
dmpi2 = (numtyp)0.5 * dmpi;
dmpk2 = (numtyp)0.5 * dmpk;
dampi = dmpi2 * r;
dampk = dmpk2 * r;
expi = ucl_exp(-dampi);
expk = ucl_exp(-dampk);
dmpi22 = dmpi2 * dmpi2;
dmpi23 = dmpi22 * dmpi2;
dmpi24 = dmpi23 * dmpi2;
dmpi25 = dmpi24 * dmpi2;
dmpk22 = dmpk2 * dmpk2;
dmpk23 = dmpk22 * dmpk2;
dmpk24 = dmpk23 * dmpk2;
dmpk25 = dmpk24 * dmpk2;
term = dmpi22 - dmpk22;
pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0);
tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term;
s = (dampi-tmp)*expk + (dampk+tmp)*expi;
ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term -
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
(dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
    d2s = (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 -
((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
(dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 +
((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
(numtyp)4.0*dmpi2*dmpk2/term) * expi;
d3s = (dmpi2*dmpk23*r4/(numtyp)15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 -
((numtyp)4.0/(numtyp)15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term -
(numtyp)4.0*dmpi2*dmpk22*r/term - (numtyp)4.0/term*dmpi2*dmpk2) * expk +
(dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 +
((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term +
(numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi;
d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 +
dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 -
      ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/(numtyp)21.0)*dmpi2*dmpk24*r3/term -
((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
(dmpi24*dmpk2*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 +
dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 +
((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term +
((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
(numtyp)4.0*dmpi2*dmpk2/term) * expi;
if (rorder >= 11) {
r6 = r5 * r;
dmpi26 = dmpi25 * dmpi2;
dmpk26 = dmpk25 * dmpk2;
d5s = (dmpi2*dmpk25*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi2*dmpk24*r5 +
dmpi2*dmpk23*r4/(numtyp)21.0 + dmpi2*dmpk22*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 -
((numtyp)4.0/(numtyp)945.0)*dmpi2*dmpk26*r5/term -
((numtyp)4.0/(numtyp)63.0)*dmpi2*dmpk25*r4/term - ((numtyp)4.0/(numtyp)9.0)*dmpi2*dmpk24*r3/term -
((numtyp)16.0/(numtyp)9.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
(numtyp)4.0*dmpi2*dmpk2/term) * expk +
(dmpi25*dmpk2*r6/(numtyp)945.0 + ((numtyp)2.0/(numtyp)189.0)*dmpi24*dmpk2*r5 +
dmpi23*dmpk2*r4/(numtyp)21.0 + dmpi22*dmpk2*r3/(numtyp)9.0 + dmpi2*dmpk2*r2/(numtyp)9.0 +
((numtyp)4.0/(numtyp)945.0)*dmpi26*dmpk2*r5/term + ((numtyp)4.0/(numtyp)63.0)*dmpi25*dmpk2*r4/term +
((numtyp)4.0/(numtyp)9.0)*dmpi24*dmpk2*r3/term + ((numtyp)16.0/(numtyp)9.0)*dmpi23*dmpk2*r2/term +
(numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
}
}
// convert partial derivatives into full derivatives
s = s * rr1;
ds = ds * rr3;
d2s = d2s * rr5;
d3s = d3s * rr7;
d4s = d4s * rr9;
d5s = d5s * rr11;
dmpik[0] = (numtyp)0.5 * pre * s * s;
dmpik[2] = pre * s * ds;
dmpik[4] = pre * (s*d2s + ds*ds);
dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s);
dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s);
if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s);
}
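// Quick reference for callers (matching the assignments above): damprep
// fills only the even slots dmpik[0,2,4,6,8], plus dmpik[10] when
// rorder >= 11; the odd slots are never written and must not be read.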
/* ----------------------------------------------------------------------
damppole generates coefficients for the charge penetration
damping function for powers of the interatomic distance
literature references:
L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the
Effective Fragment Potential Method: Theory and Application to
the Benzene Dimer", Journal of Computational Chemistry, 28,
276-291 (2007) [Gordon f1 and f2 models]
J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and
J. W. Ponder, "An Optimized Charge Penetration Model for Use with
the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19,
276-291 (2017)
------------------------------------------------------------------------- */
ucl_inline void damppole(const numtyp r, const int rorder,
const numtyp alphai, const numtyp alphak,
numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11])
{
numtyp termi,termk;
numtyp termi2,termk2;
numtyp alphai2,alphak2;
numtyp eps,diff;
numtyp expi,expk;
numtyp dampi,dampk;
numtyp dampi2,dampi3;
numtyp dampi4,dampi5;
numtyp dampi6,dampi7;
numtyp dampi8;
numtyp dampk2,dampk3;
numtyp dampk4,dampk5;
numtyp dampk6;
// compute tolerance and exponential damping factors
eps = (numtyp)0.001;
diff = alphai-alphak;
if (diff < (numtyp)0) diff = -diff;
dampi = alphai * r;
dampk = alphak * r;
expi = ucl_exp(-dampi);
expk = ucl_exp(-dampk);
// core-valence charge penetration damping for Gordon f1
dampi2 = dampi * dampi;
dampi3 = dampi * dampi2;
dampi4 = dampi2 * dampi2;
dampi5 = dampi2 * dampi3;
dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi;
dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
(numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi;
if (diff < eps) {
dmpk[0] = dmpi[0];
dmpk[2] = dmpi[2];
dmpk[4] = dmpi[4];
dmpk[6] = dmpi[6];
dmpk[8] = dmpi[8];
} else {
dampk2 = dampk * dampk;
dampk3 = dampk * dampk2;
dampk4 = dampk2 * dampk2;
dampk5 = dampk2 * dampk3;
dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk;
dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk;
dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
(numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk;
}
// valence-valence charge penetration damping for Gordon f1
if (diff < eps) {
dampi6 = dampi3 * dampi3;
dampi7 = dampi3 * dampi4;
dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 +
dampi3/(numtyp)48.0)*expi;
dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
(numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi;
dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi;
dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
dampi7/(numtyp)5040.0)*expi;
if (rorder >= 11) {
dampi8 = dampi4 * dampi4;
dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi;
}
} else {
alphai2 = alphai * alphai;
alphak2 = alphak * alphak;
termi = alphak2 / (alphak2-alphai2);
termk = alphai2 / (alphai2-alphak2);
termi2 = termi * termi;
termk2 = termk * termk;
    dmpik[0] = (numtyp)1.0 - termi2*((numtyp)1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi -
termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk;
dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
(numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi -
(numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk -
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi -
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk;
dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi -
      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk -
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi -
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk;
dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
(numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi -
termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
      (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk -
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 +
(numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi -
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 +
(numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk;
if (rorder >= 11) {
dampi6 = dampi3 * dampi3;
dampk6 = dampk3 * dampk3;
dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
(numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 +
dampi6/(numtyp)1890.0)*expi -
termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 +
(numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk -
(numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 +
dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi -
(numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 +
dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk;
}
}
}
/* ----------------------------------------------------------------------
dampdir = direct field damping coefficients
dampdir generates coefficients for the direct field damping
function for powers of the interatomic distance
------------------------------------------------------------------------- */
ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk)
{
numtyp eps,diff;
numtyp expi,expk;
numtyp dampi,dampk;
numtyp dampi2,dampk2;
numtyp dampi3,dampk3;
numtyp dampi4,dampk4;
// compute tolerance and exponential damping factors
eps = (numtyp)0.001;
diff = alphai-alphak; // fabs(alphai-alphak);
if (diff < (numtyp)0) diff = -diff;
dampi = alphai * r;
dampk = alphak * r;
expi = ucl_exp(-dampi);
expk = ucl_exp(-dampk);
// core-valence charge penetration damping for Gordon f1 (HIPPO)
dampi2 = dampi * dampi;
dampi3 = dampi * dampi2;
dampi4 = dampi2 * dampi2;
dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
if (diff < eps) {
dmpk[2] = dmpi[2];
dmpk[4] = dmpi[4];
dmpk[6] = dmpi[6];
} else {
dampk2 = dampk * dampk;
dampk3 = dampk * dampk2;
dampk4 = dampk2 * dampk2;
dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk;
}
}
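/* ----------------------------------------------------------------------
   illustrative sketch, not part of this header: dmpi[2] above is the
   charge penetration factor f(d) = 1 - (1 + d + d^2/2)*exp(-d) with
   d = alpha*r, which vanishes as r -> 0 (f ~ d^3/6) and tends to 1 at
   large r; the standalone double-precision check below mirrors that
   formula verbatim
------------------------------------------------------------------------- */
#include <cassert>
#include <cmath>

static double damp_f2(double alpha, double r) {
  const double d = alpha*r;
  return 1.0 - (1.0 + d + 0.5*d*d)*std::exp(-d);
}

int main() {
  assert(damp_f2(2.0, 1.0e-6) < 1.0e-9);   // ~ d^3/6: no correction near r = 0
  assert(damp_f2(2.0, 50.0) > 0.999999);   // fully undamped at large r
  return 0;
}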
/* ----------------------------------------------------------------------
dampmut = mutual field damping coefficients
dampmut generates coefficients for the mutual field damping
function for powers of the interatomic distance
------------------------------------------------------------------------- */
ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5])
{
numtyp termi,termk;
numtyp termi2,termk2;
numtyp alphai2,alphak2;
numtyp eps,diff;
numtyp expi,expk;
numtyp dampi,dampk;
numtyp dampi2,dampi3;
numtyp dampi4,dampi5;
numtyp dampk2,dampk3;
// compute tolerance and exponential damping factors
eps = (numtyp)0.001;
diff = alphai-alphak; // fabs(alphai-alphak);
if (diff < (numtyp)0) diff = -diff;
dampi = alphai * r;
dampk = alphak * r;
expi = ucl_exp(-dampi);
expk = ucl_exp(-dampk);
// valence-valence charge penetration damping for Gordon f1 (HIPPO)
dampi2 = dampi * dampi;
dampi3 = dampi * dampi2;
if (diff < eps) {
dampi4 = dampi2 * dampi2;
dampi5 = dampi2 * dampi3;
dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
(numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi;
dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
} else {
dampk2 = dampk * dampk;
dampk3 = dampk * dampk2;
alphai2 = alphai * alphai;
alphak2 = alphak * alphak;
termi = alphak2 / (alphak2-alphai2);
termk = alphai2 / (alphai2-alphak2);
termi2 = termi * termi;
termk2 = termk * termk;
dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
(numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk -
(numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi -
(numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk;
}
}
#endif
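/* ----------------------------------------------------------------------
   illustrative sketch, not part of this header: the two-center branches
   above are partial fraction splits whose weights satisfy the exact
   identity termi + termk = 1, because
   alphak^2/(alphak^2-alphai^2) + alphai^2/(alphai^2-alphak^2) = 1;
   the standalone double-precision check below transcribes the general
   dmpik[2] expression from dampmut and verifies both the identity and
   that the result stays inside (0,1) for a sample pair
------------------------------------------------------------------------- */
#include <cassert>
#include <cmath>

int main() {
  const double alphai = 2.0, alphak = 3.0, r = 1.5;
  const double di = alphai*r, dk = alphak*r;
  const double expi = std::exp(-di), expk = std::exp(-dk);
  const double termi = alphak*alphak/(alphak*alphak - alphai*alphai);
  const double termk = alphai*alphai/(alphai*alphai - alphak*alphak);
  assert(std::fabs(termi + termk - 1.0) < 1.0e-12);   // partial fraction weights
  const double d2 = 1.0 - termi*termi*(1.0 + di + 0.5*di*di)*expi
                        - termk*termk*(1.0 + dk + 0.5*dk*dk)*expk
                        - 2.0*termi*termi*termk*(1.0 + di)*expi
                        - 2.0*termk*termk*termi*(1.0 + dk)*expk;
  assert(d2 > 0.0 && d2 < 1.0);   // a proper damping factor for this pair
  return 0;
}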

View File

@ -576,6 +576,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_nbor.stop();
if (_time_device)
time_nbor.add_to_total();
// on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial)
// on the device, transpose the matrix (1-d array) for coalesced reads
// dev_special[i][j] = the special i neighbor of atom j
time_transpose.start();
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
@ -679,6 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
if (_cutoff < _cell_size) vadjust*=1.46;
mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
if (mn<33) mn+=3;
resize_max_neighbors<numtyp,acctyp>(mn,success);
set_nbor_block_size(mn/2);
if (!success)
@ -831,6 +837,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
time_nbor.stop();
}
void Neighbor::transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
const int columns_in, const int rows_in)
{
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(columns_in)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(rows_in)/b2y));
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&out, &in, &columns_in, &rows_in);
}
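// illustrative sketch, not part of this file (the k_transpose kernel body
// is not shown in this diff): a host-side reference with the intended
// effect, assuming "in" is a row-major rows_in x columns_in table; storing
// the transpose lets consecutive device threads (consecutive atoms) read
// consecutive addresses of the special-neighbor data
static void transpose_reference(tagint *out, const tagint *in,
                                const int columns_in, const int rows_in) {
  for (int r = 0; r < rows_in; r++)
    for (int c = 0; c < columns_in; c++)
      out[c*rows_in + r] = in[r*columns_in + c];
}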
template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
(double **x, const int inum, const int host_inum, const int nall,
Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,

View File

@ -33,7 +33,7 @@
#endif
#endif
#if defined(USE_HIP)
#if defined(USE_HIP) || defined(__APPLE__)
#define LAL_USE_OLD_NEIGHBOR
#endif
@ -259,6 +259,10 @@ class Neighbor {
return o.str();
}
/// Helper function
void transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in,
const int columns_in, const int rows_in);
private:
NeighborShared *_shared;
UCL_Device *dev;
@ -289,15 +293,17 @@ class Neighbor {
#endif
int _simd_size;
inline void set_nbor_block_size(const int mn) {
#ifdef LAL_USE_OLD_NEIGHBOR
inline void set_nbor_block_size(const int mn) {
int desired=mn/(2*_simd_size);
desired*=_simd_size;
if (desired<_simd_size) desired=_simd_size;
else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build;
_block_nbor_build=desired;
#endif
}
#else
inline void set_nbor_block_size(const int) {}
#endif
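// illustrative arithmetic for the heuristic above: with _simd_size = 32
// and mn = 150, desired = (150/64)*32 = 64 threads per block; any mn that
// yields less than one vector width floors at _simd_size, and anything
// above _max_block_nbor_build is clamped down to it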
};
}

View File

@ -48,6 +48,19 @@ _texture_2d( pos_tex,int4);
#define LAL_USE_OLD_NEIGHBOR
#endif
/*
compute the id of the cell that each atom belongs to
x: atom coordinates
cell_id: cell ids
particle_id:
boxlo[0-2]: the lower left corner of the local box
ncell[xyz]: the number of cells in the x, y and z dims
i_cell_size: the inverse cell size
inum = the number of local atoms copied to the device
nall = the number of local+ghost atoms copied to the device
cells_in_cutoff = the number of cells within the cutoff
*/
__kernel void calc_cell_id(const numtyp4 *restrict x_,
unsigned *restrict cell_id,
int *restrict particle_id,
@ -90,6 +103,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
}
}
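// illustrative sketch (an assumption -- the body of calc_cell_id is
// elided in this diff): a plausible rendering of the binning the comment
// above describes, with clamped, row-major cell indexing
//   int ix = (int)((xi - boxlo[0])*i_cell_size);
//   int iy = (int)((yi - boxlo[1])*i_cell_size);
//   int iz = (int)((zi - boxlo[2])*i_cell_size);
//   ix = max(0, min(ix, ncellx - 1));   // ghost atoms clamp into the grid
//   iy = max(0, min(iy, ncelly - 1));
//   iz = max(0, min(iz, ncellz - 1));
//   cell_id[i] = ix + iy*ncellx + iz*ncellx*ncelly;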
// compute the number of atoms in each cell
__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
int *restrict cell_counts,
int nall, int ncell) {

View File

@ -182,12 +182,15 @@
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_erfc erfc
#else
#define ucl_exp expf
#define ucl_powr powf
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
@ -195,8 +198,7 @@
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#define ucl_exp expf
#define ucl_powr powf
#define ucl_erfc erfcf
#endif

View File

@ -166,6 +166,7 @@
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_erfc erfc
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
@ -330,6 +331,10 @@
#define NEIGHMASK 0x3FFFFFFF
ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
#define SBBITS15 29
#define NEIGHMASK15 0x1FFFFFFF
ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; };
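// illustrative decode, mirroring how sbmask()/NEIGHMASK are used in the
// GPU package: the top bits of a packed neighbor index j carry the
// special-bond code and the low bits the atom index; the 15 variant added
// here spends 3 bits (codes up to 7), presumably to cover the 1-5 special
// neighbors of the AMOEBA/HIPPO force fields:
//   int which = sbmask15(j);   // 0 = not special, otherwise the code
//   j &= NEIGHMASK15;          // recover the 29-bit atom index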
// default to 32-bit smallint and other ints, 64-bit bigint:
// same as defined in src/lmptype.h
#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \

View File

@ -150,7 +150,7 @@ double SWT::host_memory_usage() const {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int SWT::loop(const int eflag, const int vflag, const int evatom,
bool &success) {
bool & /*success*/) {
const int nbor_pitch=this->nbor->nbor_pitch();
// build the short neighbor list

View File

@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4);
} \
}
// (SHUFFLE_AVAIL == 1)
#else
#define local_allocate_acc_zeta()
@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4);
} \
}
// EVFLAG == 0
#else
#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \
@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4);
ans[ii]=old; \
}
#endif
#endif
#endif // EVFLAG
#endif // SHUFFLE_AVAIL
#ifdef LAL_SIMD_IP_SYNC
#define t_per_atom t_per_atom_in

View File

@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
const double* costheta, const double* bigb,
const double* big2b, const double* bigc)
{
int success;
int success=0;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
"k_vashishta_three_end","k_vashishta_short_nbor");
@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int VashishtaT::loop(const int eflag, const int vflag, const int evatom,
bool &success) {
bool & /*success*/) {
const int nbor_pitch=this->nbor->nbor_pitch();
// build the short neighbor list

View File

@ -1,199 +0,0 @@
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 22af411f32..530510a0d1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+ endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 88ddc48378..b83ced9243 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -102,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index f56cef1651..2585a6a64c 100644
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index ef16aad047..c1d76cceeb 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()
+IF (KOKKOS_ARCH_HOPPER90)
+ SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 56f9117844..fcd4773dbc 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
case 61: return 96;
case 70:
case 80:
- case 86: return 8;
+ case 86:
+ case 90: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index 40a263561f..8c40ebd60d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index f9451ecfe6..2ce1efb98c 100644
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_AMPERE)
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index a9bc085912..27ee1d4232 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 840db4327c..7e5addbc5b 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
- !defined(KOKKOS_ARCH_AMPERE)
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 5ac7d8af30..ba101f699e 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

View File

@ -1,523 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index fcd4773dbc..30b6958a67 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these number can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
return 0;
}() * 1024;
}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 5811498e01..e22eb3b842 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84f..5c4c3a7d39 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}
-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
+
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
- bool prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+ const KernelFuncPtr& func) {
+ // Only call cudaFuncGetAttributes once for each unique kernel
+ // by leveraging static variable initialization rules
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+ cudaFuncAttributes attr;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+ return attr;
+ };
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
+ return func_attr;
+}
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+ const cudaDeviceProp& device_props,
+ const size_t block_size, int& shmem,
+ const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
- // On Kepler the L1 has no benefit since it doesn't cache reads
+
+ const auto& func_attr =
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+ // Compute limits for number of blocks due to registers/SM
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
+ const size_t regs_per_thread = func_attr.numRegs;
+ // The granularity of register allocation is chunks of 256 registers per warp
+ // -> 8 registers per thread
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+ const size_t max_blocks_regs =
+ regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute how many threads per sm we actually want
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+ // only allocate multiples of warp size
+ const size_t num_threads_desired =
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+ // Get close to the desired occupancy,
+ // don't undershoot by much but also don't allocate a whole new block just
+ // because one is a few threads over otherwise.
+ size_t num_blocks_desired =
+ (num_threads_desired + block_size * 0.8) / block_size;
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+ // Calculate how much shared memory we need per block
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
+ // If we want to lower occupancy we have to make sure we request at least that
+ // much in aggregate over all blocks, so that shared memory actually becomes a
+ // limiting factor for occupancy
+ constexpr size_t min_shmem_size_per_sm = 8192;
+ if ((occupancy < 100) &&
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+ // Need to set the caller's shmem variable so that the
+ // kernel launch uses the correct dynamic shared memory request
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
+ }
+
+ // Compute the carveout fraction we need based on occupancy
+ // Use multiples of 8kB
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+ size_t carveout = shmem_per_block == 0
+ ? 0
+ : 100 *
+ (((num_blocks_desired * shmem_per_block +
+ min_shmem_size_per_sm - 1) /
+ min_shmem_size_per_sm) *
+ min_shmem_size_per_sm) /
+ max_shmem_per_sm;
+ if (carveout > 100) carveout = 100;
+
+ // Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
- func,
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
- return prefer_shmem;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+ return carveout;
};
- static bool cache_config_preference_cached = set_cache_config();
- if (cache_config_preference_cached != prefer_shmem) {
+ // Store the value in a static variable so we only reset if needed
+ static size_t cache_config_preference_cached = set_cache_config();
+ if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
- (void)prefer_shmem;
+ (void)device_props;
+ (void)block_size;
+ (void)occupancy;
#endif
}
-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const& policy, cudaDeviceProp const& properties,
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
- bool& prefer_shmem) {
- int const block_size = block.x * block.y * block.z;
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
- size_t const static_shmem = attributes.sharedSizeBytes;
-
- // round to nearest integer and avoid division by zero
- int active_blocks = std::max(
- 1, static_cast<int>(std::round(
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
- block_size * desired_occupancy / 100)));
- int const dynamic_shmem =
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
- if (dynamic_shmem > shmem) {
- shmem = dynamic_shmem;
- prefer_shmem = false;
- }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
void const* args[] = {&driver};
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if constexpr (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
+ driver, grid, block, shmem, cuda_instance);
}
#endif
};
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
- const CudaInternal* cuda_instance,
- bool prefer_shmem) {
+ const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem);
- // If a desired occupancy is specified, we compute how much shared memory
- // to ask for to achieve that occupancy, assuming that the cache
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
- // shared memory computed is actually smaller than `shmem` we overwrite
- // `shmem` and set `prefer_shmem` to `false`.
- modify_launch_configuration_if_desired_occupancy_is_specified(
- driver.get_policy(), cuda_instance->m_deviceProp,
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
- Impl::configure_shmem_preference<
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<
+ DriverType,
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
}
static cudaFuncAttributes get_cuda_func_attributes() {
- // Race condition inside of cudaFuncGetAttributes if the same address is
- // given requires using a local variable as input instead of a static Rely
- // on static variable initialization to make sure only one thread executes
- // the code and the result is visible.
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
- cudaFuncAttributes attr_tmp;
- KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
- return attr_tmp;
- };
- static cudaFuncAttributes attr = wrap_get_attributes();
- return attr;
+ return get_cuda_kernel_func_attributes<
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func());
}
};
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index e586bb4cc6..0e348c092a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index ac160f8fe2..d1031751c2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
}
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdd16085b3..ea9430b812 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
m_policy.space().fence(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 34d4bef9fd..178012431c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fb3a6b138f..a12378a891 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>(
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
- false);
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
}
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

View File

@ -1,46 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 30b6958a67..b94f053272 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-// "Compute Capability" first and check what is the smallest "Shared Memory
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-// bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-// the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
- int const compute_capability = properties.major * 10 + properties.minor;
- return [compute_capability]() {
- switch (compute_capability) {
- case 30:
- case 32:
- case 35: return 16;
- case 37: return 80;
- case 50:
- case 53:
- case 60:
- case 62: return 64;
- case 52:
- case 61: return 96;
- case 70:
- case 80:
- case 86:
- case 90: return 8;
- case 75: return 32;
- default:
- Kokkos::Impl::throw_runtime_exception(
- "Unknown device in cuda block size deduction");
- }
- return 0;
- }() * 1024;
-}
-
} // namespace Impl
} // namespace Kokkos

View File

@ -1,204 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index b94f053272..252c13c524 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -53,17 +53,69 @@
namespace Kokkos {
namespace Impl {
+inline int cuda_warp_per_sm_allocation_granularity(
+ cudaDeviceProp const& properties) {
+ // Allocation granularity of warps in each sm
+ switch (properties.major) {
+ case 3:
+ case 5:
+ case 7:
+ case 8:
+ case 9: return 4;
+ case 6: return (properties.minor == 0 ? 2 : 4);
+ default:
+ throw_runtime_exception(
+ "Unknown device in cuda warp per sm allocation granularity");
+ return 0;
+ }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+ // Maximum number of warps per sm as a function of register counts,
+ // subject to the constraint that warps are allocated with a fixed granularity
+ int const max_regs_per_block = properties.regsPerBlock;
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
+ int const warp_granularity =
+ cuda_warp_per_sm_allocation_granularity(properties);
+ // The granularity of register allocation is chunks of 256 registers per warp,
+ // which implies a need to over-allocate, so we round up
+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
+
+ // The maximum number of warps per SM is constrained from above by register
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
+ // finite granularity, we need to round down.
+ int const max_warps_per_sm =
+ warp_granularity *
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+ return max_warps_per_sm;
+}
+
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
- regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ int const max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(properties, attributes);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / properties.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+ LaunchBounds) {
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
+ // memory
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+ LaunchBounds{});
+}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5c4c3a7d39..170183ca0a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ const size_t max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
+
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0e348c092a..7e4f62f12e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {
+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+ cudaFuncAttributes attr =
+ CudaParallelLaunch<ParallelType,
+ LaunchBounds>::get_cuda_func_attributes();
+ auto const& prop = pol.space().cuda_device_prop();
+
+ // Limits due to registers/SM, MDRange doesn't have
+ // shared memory constraints
+ int const optimal_block_size =
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+ // Compute how many blocks of this size we can launch, based on warp
+ // constraints
+ int const max_warps_per_sm_registers =
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+ int const max_num_threads_from_warps =
+ max_warps_per_sm_registers * prop.warpSize;
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+ // Compute the total number of threads
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+ return std::min(
+ max_threads_per_sm,
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelFor,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelReduce,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {

View File

@ -1,63 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 170183ca0a..ba43e362bb 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
+ /*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
void const* args[] = {&driver};
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- if constexpr (DriverType::Policy::
+ if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

potentials/HGa.msmeam Normal file
View File

@ -0,0 +1,30 @@
bkgd_dyn = 1
emb_lin_neg = 1
augt1=0
ialloy=1
rc = 5.9
#H
attrac(1,1)=0.460
repuls(1,1)=0.460
Cmin(1,1,1)=1.3 # PuMS
Cmax(1,1,1)= 2.80
nn2(1,1)=1
#Ga
rho0(2) = 0.6
attrac(2,2)=0.097
repuls(2,2)=0.097
nn2(2,2)=1
#HGa
attrac(1,2)=0.300
repuls(1,2)=0.300
lattce(1,2)=l12
re(1,2)=3.19
delta(1,2)=-0.48
alpha(1,2)=6.6
Cmin(1,1,2)=2.0
Cmin(2,1,2)= 2.0
Cmin(1,2,1)=2.0
Cmin(2,2,1) = 1.4
Cmin(1,2,2) = 1.4
Cmin(1,1,2) = 1.4
nn2(1,2)=1

potentials/library.msmeam Normal file
View File

@ -0,0 +1,14 @@
# DATE: 2018-09-22 UNITS: metal CONTRIBUTOR: Steve Valone, smv@lanl.gov CITATION: Baskes, PRB 1992; smv, sr, mib, JNM 2010
# ms-meam data format May 2010
# elt lat z ielement atwt
# alpha b0 b1 b2 b3 b1m b2m b3m alat esub asub
# - t0 t1 t2 t3 t1m t2m t3m rozero ibar
# NOTE: leading character cannot be a space
'H' 'dim' 1.0 1 1.0079
2.960 2.960 3.0 1.0 1.0 1.0 3.0 1.0 0.741 2.235 2.50
1.0 0.44721 0.0 0.00 0.0 0.31623 0 6.70 0
'Ga4' 'fcc' 12.0 31 69.723
4.42 4.80 3.10 6.00 0.00 0.0 0.0 0.5 4.247 2.897 0.97
1.0 1.649 1.435 0.00 0.0 0.0 2.0 0.70 0

View File

@ -32,7 +32,7 @@ if not pylib.Py_IsInitialized():
else:
from .loader import load_model, load_unified, activate_mliappy
try:
from .loader import load_model_kokkos, activate_mliappy_kokkos
from .loader import load_model_kokkos, load_unified_kokkos, activate_mliappy_kokkos
except Exception as ee:
# ignore import error, it means that the KOKKOS package was not included in LAMMPS
pass


@@ -75,7 +75,7 @@ def activate_mliappy(lmp):
def activate_mliappy_kokkos(lmp):
try:
library = lmp.lib
module_names = ["mliap_model_python_couple_kokkos"]
module_names = ["mliap_model_python_couple_kokkos", "mliap_unified_couple_kokkos"]
api_version = library.lammps_python_api_version()
for module_name in module_names:
@@ -118,3 +118,12 @@ def load_unified(model):
) from ie
mliap_unified_couple.load_from_python(model)
def load_unified_kokkos(model):
try:
import mliap_unified_couple_kokkos
except ImportError as ie:
raise ImportError("ML-IAP python module must be activated before loading\n"
"the pair style. Call lammps.mliap.activate_mliappy(lmp)."
) from ie
mliap_unified_couple_kokkos.load_from_python(model)


@@ -22,6 +22,7 @@
#include "memory.h"
#include "neighbor.h"
#include "remap_wrap.h"
#include "timer.h"
#include <cmath>
#include <cstring>
@@ -326,15 +327,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
cfft[n++] = ZEROF;
}
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// perform forward FFT
fft1->compute(cfft,cfft,FFT3d::FORWARD);
time1 = platform::walltime();
if (SCALE) {
double scale = 1.0/nfft_global;
FFT_SCALAR scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
}
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"PRE Convo / POST FFT");
debug_file(CFFT1,"pre.convo.post.fft");
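
The instrumentation added in this hunk (optional barrier, wall-clock stamps around the FFT, accumulation into time_fft) is the same idiom used throughout the rest of this patch. A self-contained sketch, with MPI_Wtime() standing in for LAMMPS' platform::walltime() and a bool for Timer::has_sync():

#include <mpi.h>

// Accumulate the elapsed wall time of 'work' into 'accum'. Synchronizing
// first ensures load imbalance is not misattributed to the timed section.
template <class Work>
void timed_section(Work &&work, double &accum, MPI_Comm world, bool sync) {
  if (sync) MPI_Barrier(world);
  const double t0 = MPI_Wtime();
  work();                              // e.g. fft1->compute(..., FFT3d::FORWARD)
  accum += MPI_Wtime() - t0;
}

Note the hunk also switches the scale factor to FFT_SCALAR so the normalization matches the precision of the grid it multiplies.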
@@ -382,15 +391,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
debug_scalar(FFT,"PRE Convo / POST Remap");
debug_file(FFT,"pre.convo.post.remap");
#endif
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// perform forward FFT
fft1->compute(cfft,cfft,FFT3d::FORWARD);
time1 = platform::walltime();
if (SCALE) {
double scale = 1.0/nfft_global;
FFT_SCALAR scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
}
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"PRE Convo / POST FFT");
debug_file(CFFT1,"pre.convo.post.fft");
@@ -423,7 +441,16 @@ void *AmoebaConvolution::post_convolution_3d()
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = platform::walltime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");
@@ -465,8 +492,18 @@ void *AmoebaConvolution::post_convolution_4d()
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = platform::walltime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");
debug_file(CFFT2,"post.convo.post.fft");


@@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers {
int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out;
int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft;
bigint nfft_global; // nx * ny * nz
double *grid_brick_start; // lower left corner of (c)grid_brick data
FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data
AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int);
~AmoebaConvolution();
@@ -47,7 +47,9 @@ class AmoebaConvolution : protected Pointers {
FFT_SCALAR *pre_convolution();
void *post_convolution();
private:
double time_fft;
protected:
int which; // caller name for convolution being performed
int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick
int nbrick_owned; // owned grid points in brick decomp
@@ -59,23 +61,23 @@
class Grid3d *gc;
class Remap *remap;
double ***grid_brick; // 3d real brick grid with ghosts
double ****cgrid_brick; // 4d complex brick grid with ghosts
FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts
FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts
FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector
FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector
double *gc_buf1, *gc_buf2; // buffers for GridComm
double *remap_buf; // buffer for Remap
FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm
FFT_SCALAR *remap_buf; // buffer for Remap
void allocate_grid();
void deallocate_grid();
void *zero_3d();
void *zero_4d();
FFT_SCALAR *pre_convolution_3d();
FFT_SCALAR *pre_convolution_4d();
virtual FFT_SCALAR *pre_convolution_4d();
void *post_convolution_3d();
void *post_convolution_4d();
virtual void *post_convolution_4d();
void procs2grid2d(int, int, int, int &, int &);
// DEBUG
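
The systematic double to FFT_SCALAR conversion in this header (and mirrored in the callers throughout this patch) makes the convolution grids, communication buffers, and FFT vectors follow the precision selected for FFTs at build time. For reference, FFT_SCALAR resolves roughly as in this sketch of LAMMPS' FFT settings (the actual definition lives in the FFT headers):

#ifdef FFT_SINGLE
typedef float FFT_SCALAR;            // -D FFT_SINGLE: single-precision grids/FFTs
#define MPI_FFT_SCALAR MPI_FLOAT
#else
typedef double FFT_SCALAR;           // default: double-precision grids/FFTs
#define MPI_FFT_SCALAR MPI_DOUBLE
#endif

Making pre_convolution_4d() and post_convolution_4d() virtual is what lets the GPU package substitute its own FFT path further down in this patch.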


@@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
double ***gridpre = (double ***) d_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero();
// map atoms to grid
@@ -294,7 +294,7 @@
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = d_kspace->pre_convolution();
FFT_SCALAR *gridfft = d_kspace->pre_convolution();
// ---------------------
// convolution operation
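
All of the k-space solves touched in this patch follow the same five-step sequence around AmoebaConvolution; schematically, with the grid types as changed here (map_to_grid, convolve, and interpolate are placeholders for the respective grid_*, influence-function, and fphi_* routines):

FFT_SCALAR ***gridpre = (FFT_SCALAR ***) kspace->zero();              // zeroed brick grid w/ ghosts
map_to_grid(gridpre);                                                 // spread atom quantities to grid
FFT_SCALAR *gridfft = kspace->pre_convolution();                      // ghost comm + remap + forward FFT
convolve(gridfft);                                                    // multiply by convolution prefactors
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) kspace->post_convolution(); // backward FFT + ghost comm
interpolate(gridpost);                                                // gather potential back to atoms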


@@ -24,6 +24,7 @@
#include "math_special.h"
#include "my_page.h"
#include "neigh_list.h"
#include "timer.h"
#include <cmath>
@@ -381,8 +382,6 @@ void PairAmoeba::induce()
}
}
// if (comm->me == 0) printf("CG iteration count = %d\n",iter);
// terminate the calculation if dipoles failed to converge
// NOTE: could make this an error
@@ -546,13 +545,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp)
}
}
// get the reciprocal space part of the mutual field
if (polar_kspace_flag) umutual1(field,fieldp);
double time0, time1, time2;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// get the real space portion of the mutual field
if (polar_rspace_flag) umutual2b(field,fieldp);
time1 = platform::walltime();
// get the reciprocal space part of the mutual field
if (polar_kspace_flag) umutual1(field,fieldp);
time2 = platform::walltime();
// add the self-energy portion of the mutual field
@@ -563,6 +568,11 @@
fieldp[i][j] += term*uinp[i][j];
}
}
// accumulate timing information
time_mutual_rspace += time1 - time0;
time_mutual_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@@ -785,7 +795,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp)
// get the reciprocal space part of the permanent field
double time0, time1, time2;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
if (polar_kspace_flag) udirect1(field);
time1 = platform::walltime();
for (i = 0; i < nlocal; i++) {
for (j = 0; j < 3; j++) {
@@ -796,6 +811,7 @@
// get the real space portion of the permanent field
if (polar_rspace_flag) udirect2b(field,fieldp);
time2 = platform::walltime();
// get the self-energy portion of the permanent field
@@ -806,6 +822,11 @@
fieldp[i][j] += term*rpole[i][j+1];
}
}
// accumulate timing information
time_direct_kspace += time1 - time0;
time_direct_rspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@@ -842,18 +863,26 @@ void PairAmoeba::umutual1(double **field, double **fieldp)
}
}
double time0, time1;
// gridpre = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpre = (double ****) ic_kspace->zero();
FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero();
// map 2 values to grid
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
grid_uind(fuind,fuinp,gridpre);
time1 = platform::walltime();
time_grid_uind += (time1 - time0);
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = ic_kspace->pre_convolution();
FFT_SCALAR *gridfft = ic_kspace->pre_convolution();
// ---------------------
// convolution operation
@@ -883,12 +912,18 @@
// post-convolution operations including backward FFT
  // gridpost = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpost = (double ****) ic_kspace->post_convolution();
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution();
// get potential
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
time1 = platform::walltime();
time_fphi_uind += (time1 - time0);
// store fractional reciprocal potentials for OPT method
if (poltyp == OPT) {
@@ -1055,7 +1090,7 @@ void PairAmoeba::udirect1(double **field)
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by setup()
double ***gridpre = (double ***) i_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero();
// map multipole moments to grid
@@ -1064,7 +1099,7 @@
// pre-convolution operations including forward FFT
// gridfft = my 1d portion of complex 3d grid in FFT decomp
double *gridfft = i_kspace->pre_convolution();
FFT_SCALAR *gridfft = i_kspace->pre_convolution();
// ---------------------
// convolution operation
@@ -1109,7 +1144,7 @@
// post-convolution operations including backward FFT
  // gridpost = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpost = (double ***) i_kspace->post_convolution();
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution();
// get potential


@@ -68,25 +68,23 @@ void PairAmoeba::moduli()
int maxfft = MAX(nfft1,nfft2);
maxfft = MAX(maxfft,nfft3);
double *array = new double[bsorder];
double *bsarray = new double[maxfft];
if (maxfft > _nfft_max) {
memory->destroy(_moduli_bsarray);
_nfft_max = maxfft;
memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray");
}
// compute and load the moduli values
double x = 0.0;
bspline(x,bsorder,array);
bspline(x,bsorder,_moduli_array);
for (i = 0; i < maxfft; i++) bsarray[i] = 0.0;
for (i = 0; i < bsorder; i++) bsarray[i+1] = array[i];
for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0;
for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i];
dftmod(bsmod1,bsarray,nfft1,bsorder);
dftmod(bsmod2,bsarray,nfft2,bsorder);
dftmod(bsmod3,bsarray,nfft3,bsorder);
// perform deallocation of local arrays
delete[] array;
delete[] bsarray;
dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder);
dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder);
dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder);
}
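
The net effect of this rewrite: the per-call new[]/delete[] pair for array and bsarray is replaced by _moduli_array/_moduli_bsarray members that persist across calls and are regrown only when maxfft exceeds the cached _nfft_max. A standalone sketch of the grow-on-demand idiom, with std::vector standing in for LAMMPS' memory->create()/destroy():

#include <algorithm>
#include <vector>

struct ModuliScratch {
  std::vector<double> bsarray;                            // cached between calls
  void ensure(std::size_t maxfft) {
    if (maxfft > bsarray.size()) bsarray.resize(maxfft);  // grow only on demand
    std::fill(bsarray.begin(), bsarray.end(), 0.0);       // reset before reuse
  }
};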
/* ----------------------------------------------------------------------
@@ -525,7 +523,7 @@
grid_mpole maps fractional atomic multipoles to PME grid
------------------------------------------------------------------------- */
void PairAmoeba::grid_mpole(double **fmp, double ***grid)
void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid)
{
int i,j,k,m,ib,jb,kb;
double v0,u0,t0;
@@ -598,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid)
the particle mesh Ewald grid
------------------------------------------------------------------------- */
void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi)
{
int i,j,k,m,ib,jb,kb;
double v0,v1,v2,v3;
@@ -742,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi)
grid_uind maps fractional induced dipoles to the PME grid
------------------------------------------------------------------------- */
void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid)
{
int i,j,k,m,ib,jb,kb;
double v0,u0,t0;
@@ -793,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid)
fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
------------------------------------------------------------------------- */
void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1,
double **fdip_phi2, double **fdip_sum_phi)
{
int i,j,k,m,ib,jb,kb;
@@ -1042,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1,
grid_disp maps dispersion coefficients to PME grid
------------------------------------------------------------------------- */
void PairAmoeba::grid_disp(double ***grid)
void PairAmoeba::grid_disp(FFT_SCALAR ***grid)
{
int i,j,k,m,ib,jb,kb,itype,iclass;
double v0,u0,t0;


@@ -21,6 +21,7 @@
#include "math_const.h"
#include "math_special.h"
#include "neigh_list.h"
#include "timer.h"
#include <cmath>
@@ -55,6 +56,8 @@ void PairAmoeba::multipole()
double qixx,qixy,qixz,qiyy,qiyz,qizz;
double cii,dii,qii;
double time0,time1,time2;
// set cutoffs, taper coeffs, and PME params
if (use_ewald) choose(MPOLE_LONG);
@@ -78,13 +81,18 @@
felec = electric / am_dielectric;
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// compute the real space part of the Ewald summation
if (mpole_rspace_flag) multipole_real();
time1 = platform::walltime();
// compute the reciprocal space part of the Ewald summation
if (mpole_kspace_flag) multipole_kspace();
time2 = platform::walltime();
// compute the Ewald self-energy term over all the atoms
@@ -109,6 +117,11 @@
e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0));
empole += e;
}
// accumulate timing information
time_mpole_rspace += time1 - time0;
time_mpole_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@@ -361,6 +374,9 @@ void PairAmoeba::multipole_real()
bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2;
}
for (k = 0; k < 6; k++) bn[k] *= felec;
//if (i == 0 && j < 10) {
// printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]);
//}
// find damped multipole intermediates and energy value
@@ -404,6 +420,8 @@
term2i*rr3i + term2k*rr3k + term2ik*rr3ik +
term3i*rr5i + term3k*rr5k + term3ik*rr5ik;
// find damped multipole intermediates for force and torque
de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik +
@@ -444,6 +462,7 @@
term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9);
term6 = 4.0 * rr7;
}
empole += e;
@@ -482,6 +501,7 @@
tq[i][2] += ttmi[2];
// increment force-based gradient and torque on second site
// commenting out j parts for DEBUGGING
f[j][0] += frcx;
f[j][1] += frcy;
@@ -638,7 +658,7 @@
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpre = (double ***) m_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero();
// map atoms to grid
@@ -647,7 +667,7 @@
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft = m_kspace->pre_convolution();
FFT_SCALAR *gridfft = m_kspace->pre_convolution();
// ---------------------
// convolution operation
@@ -718,7 +738,7 @@
// post-convolution operations including backward FFT
  // gridpost = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpost = (double ***) m_kspace->post_convolution();
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution();
// get potential


@@ -21,6 +21,7 @@
#include "math_const.h"
#include "math_special.h"
#include "neigh_list.h"
#include "timer.h"
#include <cmath>
#include <cstring>
@@ -55,6 +56,8 @@ void PairAmoeba::polar()
double fix[3],fiy[3],fiz[3];
double tep[3];
double time0,time1,time2;
// set cutoffs, taper coeffs, and PME params
if (use_ewald) choose(POLAR_LONG);
@@ -76,11 +79,16 @@
// compute the real space part of the dipole interactions
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
if (polar_rspace_flag) polar_real();
time1 = platform::walltime();
// compute the reciprocal space part of dipole interactions
if (polar_kspace_flag) polar_kspace();
time2 = platform::walltime();
// compute the Ewald self-energy torque and virial terms
@@ -133,6 +141,11 @@
virpolar[4] -= vxz;
virpolar[5] -= vyz;
}
// accumulate timing information
time_polar_rspace += time1 - time0;
time_polar_kspace += time2 - time1;
}
/* ----------------------------------------------------------------------
@@ -382,7 +395,7 @@ void PairAmoeba::polar_real()
factor_uscale = 1.0;
}
}
//if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale);
r = sqrt(r2);
ck = rpole[j][0];
dkx = rpole[j][1];
@@ -597,7 +610,6 @@
dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir;
dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir;
dufld[i][5] += zr*tiz5 + zr*zr*tuir;
dufld[j][0] -= xr*tkx5 + xr*xr*tukr;
dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr;
dufld[j][2] -= yr*tky5 + yr*yr*tukr;
@@ -855,6 +867,7 @@
frcx = -2.0 * depx;
frcy = -2.0 * depy;
frcz = -2.0 * depz;
}
// get the dtau/dr terms used for mutual polarization force
@@ -1327,7 +1340,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpre = (double ***) p_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@@ -1336,7 +1349,7 @@
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft = p_kspace->pre_convolution();
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
// ---------------------
// convolution operation
@@ -1386,7 +1399,7 @@
// post-convolution operations including backward FFT
  // gridpost = my portion of 3d grid in brick decomp w/ ghost values
double ***gridpost = (double ***) p_kspace->post_convolution();
FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution();
// get potential
@@ -1419,7 +1432,7 @@
// gridpre2 = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpre2 = (double ****) pc_kspace->zero();
FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero();
// map 2 values to grid
@@ -1428,7 +1441,7 @@
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomposition
double *gridfft = pc_kspace->pre_convolution();
FFT_SCALAR *gridfft = pc_kspace->pre_convolution();
// ---------------------
// convolution operation
@@ -1451,7 +1464,7 @@
// post-convolution operations including backward FFT
  // gridpost = my portion of 4d grid in brick decomp w/ ghost values
double ****gridpost = (double ****) pc_kspace->post_convolution();
FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution();
// get potential
@@ -1857,7 +1870,7 @@ void PairAmoeba::polar_kspace()
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
double ***gridpre = (double ***) p_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@@ -1887,7 +1900,7 @@
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
gridpre = (double ***) p_kspace->zero();
gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@@ -1896,7 +1909,7 @@
// pre-convolution operations including forward FFT
// gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors
double *gridfft2 = p_kspace->pre_convolution();
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
// ---------------------
// convolution operation
@@ -1953,7 +1966,7 @@
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
// zeroed by zero()
double ***gridpre = (double ***) p_kspace->zero();
FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@@ -1962,12 +1975,12 @@
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft = p_kspace->pre_convolution();
FFT_SCALAR *gridfft = p_kspace->pre_convolution();
// gridfft1 = copy of first FFT
int nfft_owned = p_kspace->nfft_owned;
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double));
memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR));
// assign ??? to the PME grid
@@ -1982,7 +1995,7 @@
// gridpre = my portion of 3d grid in brick decomp w/ ghost values
gridpre = (double ***) p_kspace->zero();
gridpre = (FFT_SCALAR ***) p_kspace->zero();
// map atoms to grid
@@ -1991,7 +2004,7 @@
// pre-convolution operations including forward FFT
// gridfft = my portion of complex 3d grid in FFT decomp as 1d vector
double *gridfft2 = p_kspace->pre_convolution();
FFT_SCALAR *gridfft2 = p_kspace->pre_convolution();
// ---------------------
// convolution operation


@@ -194,8 +194,8 @@ void FixAmoebaBiTorsion::init()
  // error check that PairAmoeba or PairHippo exists
pair = nullptr;
pair = force->pair_match("amoeba",1,0);
if (!pair) pair = force->pair_match("hippo",1,0);
pair = force->pair_match("^amoeba",0,0);
if (!pair) pair = force->pair_match("^hippo",0,0);
if (!pair)
error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo");
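
Here, and in the improper style below, an exact style-name lookup is replaced by an anchored regular-expression match so that suffixed variants of the pair style are also accepted. Schematically (assuming the Force::pair_match(style, exact, nsub) semantics where exact=0 treats the first argument as a regex):

pair = force->pair_match("amoeba", 1, 0);    // exact: finds only pair_style amoeba
pair = force->pair_match("^amoeba", 0, 0);   // anchored regex: also finds amoeba/gpu etc.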


@@ -285,8 +285,9 @@ void ImproperAmoeba::init_style()
// check if PairAmoeba disabled improper terms
Pair *pair = nullptr;
pair = force->pair_match("amoeba",1,0);
if (!pair) pair = force->pair_match("hippo",1,0);
pair = force->pair_match("^amoeba",0,0);
if (!pair) pair = force->pair_match("^hippo",0,0);
if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo");
int tmp;


@@ -29,6 +29,7 @@
#include "my_page.h"
#include "neigh_list.h"
#include "neighbor.h"
#include "timer.h"
#include "update.h"
#include <cmath>
@@ -47,6 +48,7 @@ enum{MUTUAL,OPT,TCG,DIRECT};
enum{GEAR,ASPC,LSQR};
#define DELTASTACK 16
#define DEBUG_AMOEBA 0
/* ---------------------------------------------------------------------- */
@@ -85,6 +87,10 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp)
cmp = fmp = nullptr;
cphi = fphi = nullptr;
_moduli_array = nullptr;
_moduli_bsarray = nullptr;
_nfft_max = 0;
poli = nullptr;
conj = conjp = nullptr;
vec = vecp = nullptr;
@@ -227,6 +233,9 @@ PairAmoeba::~PairAmoeba()
memory->destroy(fphidp);
memory->destroy(cphidp);
memory->destroy(_moduli_array);
memory->destroy(_moduli_bsarray);
memory->destroy(thetai1);
memory->destroy(thetai2);
memory->destroy(thetai3);
@@ -349,12 +358,22 @@ void PairAmoeba::compute(int eflag, int vflag)
if (update->ntimestep <= update->beginstep+1) {
time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0;
time_induce = time_polar = time_qxfer = 0.0;
time_mpole_rspace = time_mpole_kspace = 0.0;
time_direct_rspace = time_direct_kspace = 0.0;
time_mutual_rspace = time_mutual_kspace = 0.0;
time_polar_rspace = time_polar_kspace = 0.0;
time_grid_uind = time_fphi_uind = 0.0;
if (ic_kspace) {
ic_kspace->time_fft = 0.0;
}
}
double time0,time1,time2,time3,time4,time5,time6,time7,time8;
MPI_Barrier(world);
time0 = MPI_Wtime();
if (timer->has_sync()) MPI_Barrier(world);
time0 = platform::walltime();
// if reneighboring step:
// augment neighbor list to include 1-5 neighbor flags
@@ -410,8 +429,7 @@
comm->forward_comm(this);
if (amoeba) pbc_xred();
time1 = MPI_Wtime();
time1 = platform::walltime();
// ----------------------------------------
// compute components of force field
@@ -420,22 +438,22 @@
// buffered 14-7 Vdwl, pairwise
if (amoeba && hal_flag) hal();
time2 = MPI_Wtime();
time2 = platform::walltime();
// Pauli repulsion, pairwise
if (!amoeba && repulse_flag) repulsion();
time3 = MPI_Wtime();
time3 = platform::walltime();
// Ewald dispersion, pairwise and long range
if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion();
time4 = MPI_Wtime();
time4 = platform::walltime();
// multipole, pairwise and long range
if (mpole_rspace_flag || mpole_kspace_flag) multipole();
time5 = MPI_Wtime();
time5 = platform::walltime();
  // induced dipoles, iterative CG relaxation
// communicate induce() output values needed by ghost atoms
@@ -445,17 +463,17 @@
cfstyle = INDUCE;
comm->forward_comm(this);
}
time6 = MPI_Wtime();
time6 = platform::walltime();
// dipoles, pairwise and long range
if (polar_rspace_flag || polar_kspace_flag) polar();
time7 = MPI_Wtime();
time7 = platform::walltime();
// charge transfer, pairwise
if (!amoeba && qxfer_flag) charge_transfer();
time8 = MPI_Wtime();
time8 = platform::walltime();
// store energy components for output by compute pair command
@@ -518,6 +536,44 @@ void PairAmoeba::finish()
MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_qxfer = ave/comm->nprocs;
#if DEBUG_AMOEBA
// real-space/kspace breakdown
MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mpole_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mpole_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_direct_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_direct_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_polar_rspace = ave/comm->nprocs;
MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_polar_kspace = ave/comm->nprocs;
MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_grid_uind = ave/comm->nprocs;
MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_fphi_uind = ave/comm->nprocs;
double time_mutual_fft = 0;
if (ic_kspace) time_mutual_fft = ic_kspace->time_fft;
MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_fft = ave/comm->nprocs;
#endif // DEBUG_AMOEBA
double time_total = (time_init + time_hal + time_repulse + time_disp +
time_mpole + time_induce + time_polar + time_qxfer) / 100.0;
@@ -534,8 +590,27 @@
utils::logmesg(lmp," Induce time: {:<12.6g} {:6.2f}%\n", time_induce, time_induce/time_total);
utils::logmesg(lmp," Polar time: {:<12.6g} {:6.2f}%\n", time_polar, time_polar/time_total);
if (!amoeba)
utils::logmesg(lmp," Qxfer time: {:<12.6g} {:6.2f}%\n", time_qxfer, time_qxfer/time_total);
utils::logmesg(lmp," Total time: {:<12.6g}\n",time_total * 100.0);
utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total);
utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0);
#if DEBUG_AMOEBA
double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace;
double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace;
utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total);
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total);
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total);
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total);
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total);
utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total);
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total);
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total);
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total);
utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total);
utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total);
utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total);
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total);
#endif
}
}
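
Every timer in finish() is reduced the same way: sum the per-rank value, then divide by the communicator size to report a per-rank mean. As a small self-contained helper expressing that reduction (the name average_over_ranks is illustrative):

#include <mpi.h>

double average_over_ranks(double local, MPI_Comm world) {
  double sum = 0.0;
  int nprocs = 1;
  MPI_Allreduce(&local, &sum, 1, MPI_DOUBLE, MPI_SUM, world);
  MPI_Comm_size(world, &nprocs);
  return sum / nprocs;     // per-rank mean, as printed in the breakdown
}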
@@ -2320,6 +2395,8 @@ void PairAmoeba::grow_local()
firstneigh_pcpc = (double **)
memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc");
}
memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array");
}
/* ----------------------------------------------------------------------


@@ -82,6 +82,12 @@ class PairAmoeba : public Pair {
double time_init, time_hal, time_repulse, time_disp;
double time_mpole, time_induce, time_polar, time_qxfer;
double time_mpole_rspace, time_mpole_kspace;
double time_direct_rspace, time_direct_kspace;
double time_mutual_rspace, time_mutual_kspace;
double time_polar_rspace, time_polar_kspace;
double time_grid_uind, time_fphi_uind;
// energy/virial components
double ehal, erepulse, edisp, epolar, empole, eqxfer;
@@ -324,8 +330,12 @@
  double *qfac;    // convolution pre-factors
double *gridfft1; // copy of p_kspace FFT grid
double **cmp, **fmp; // Cartesian and fractional multipoles
double **cphi, **fphi;
double **cmp,**fmp; // Cartesian and fractional multipoles
double **cphi,**fphi;
double *_moduli_array; // buffers for moduli
double *_moduli_bsarray;
int _nfft_max;
// params for current KSpace solve and FFT being worked on
@@ -335,8 +345,12 @@
double ctf[10][10]; // indices NOT flipped vs Fortran
double ftc[10][10]; // indices NOT flipped vs Fortran
class AmoebaConvolution *m_kspace, *p_kspace, *pc_kspace, *d_kspace;
class AmoebaConvolution *i_kspace, *ic_kspace;
class AmoebaConvolution *m_kspace; // multipole KSpace
class AmoebaConvolution *p_kspace; // polar KSpace
class AmoebaConvolution *pc_kspace;
class AmoebaConvolution *d_kspace; // dispersion KSpace
class AmoebaConvolution *i_kspace; // induce KSpace
class AmoebaConvolution *ic_kspace;
// FFT grid size factors
@@ -347,33 +361,33 @@
void hal();
void repulsion();
void damprep(double, double, double, double, double, double, double, double, int, double, double,
double *);
virtual void repulsion();
void damprep(double, double, double, double, double, double, double, double,
int, double, double, double *);
void dispersion();
void dispersion_real();
virtual void dispersion_real();
void dispersion_kspace();
void multipole();
void multipole_real();
virtual void multipole_real();
void multipole_kspace();
void polar();
void polar_energy();
void polar_real();
void polar_kspace();
virtual void polar_real();
virtual void polar_kspace();
void damppole(double, int, double, double, double *, double *, double *);
void induce();
virtual void induce();
void ulspred();
void ufield0c(double **, double **);
virtual void ufield0c(double **, double **);
void uscale0b(int, double **, double **, double **, double **);
void dfield0c(double **, double **);
void umutual1(double **, double **);
void umutual2b(double **, double **);
virtual void umutual1(double **, double **);
virtual void umutual2b(double **, double **);
void udirect1(double **);
void udirect2b(double **, double **);
virtual void udirect2b(double **, double **);
void dampmut(double, double, double, double *);
void dampdir(double, double, double, double *, double *);
void cholesky(int, double *, double *);
@@ -393,11 +407,11 @@
void fphi_to_cphi(double **, double **);
void frac_to_cart();
void grid_mpole(double **, double ***);
void fphi_mpole(double ***, double **);
void grid_uind(double **, double **, double ****);
void fphi_uind(double ****, double **, double **, double **);
void grid_disp(double ***);
void grid_mpole(double **, FFT_SCALAR ***);
void fphi_mpole(FFT_SCALAR ***, double **);
void grid_uind(double **, double **, FFT_SCALAR ****);
virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **);
void grid_disp(FFT_SCALAR ***);
void kewald();
void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &,


@@ -45,6 +45,10 @@ depend () {
# add one if statement per parent package
# add one depend() call per child package that depends on that parent
if (test $1 = "AMOEBA") then
depend GPU
fi
if (test $1 = "ASPHERE") then
depend GPU
depend OPENMP


@@ -28,6 +28,8 @@ action () {
# list of files with optional dependencies
action amoeba_convolution_gpu.cpp amoeba_convolution.cpp
action amoeba_convolution_gpu.h amoeba_convolution.cpp
action fix_gpu.cpp
action fix_gpu.h
action fix_nve_gpu.h
@@ -41,6 +43,8 @@
action fix_nve_asphere_gpu.h fix_nve_asphere.h
action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp
action gpu_extra.h
action pair_amoeba_gpu.cpp pair_amoeba.cpp
action pair_amoeba_gpu.h pair_amoeba.h
action pair_beck_gpu.cpp pair_beck.cpp
action pair_beck_gpu.h pair_beck.h
action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp
@@ -89,6 +93,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp
action pair_gauss_gpu.h pair_gauss.h
action pair_gayberne_gpu.cpp pair_gayberne.cpp
action pair_gayberne_gpu.h pair_gayberne.cpp
action pair_hippo_gpu.cpp pair_hippo.cpp
action pair_hippo_gpu.h pair_hippo.cpp
action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp
action pair_lj96_cut_gpu.h pair_lj96_cut.h
action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp
@@ -113,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp
action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h
action pair_lj_cut_gpu.cpp
action pair_lj_cut_gpu.h
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
action pair_lj_smooth_gpu.cpp pair_lj_smooth.cpp
action pair_lj_smooth_gpu.h pair_lj_smooth.cpp
action pair_lj_expand_gpu.cpp
@@ -155,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp
action pppm_gpu.h pppm.cpp
action pair_ufm_gpu.cpp pair_ufm.cpp
action pair_ufm_gpu.h pair_ufm.h
action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp
action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp
action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp
action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp
# edit 2 Makefile.package files to include/exclude package info


@@ -0,0 +1,181 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/ Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "amoeba_convolution_gpu.h"
#include "comm.h"
#include "fft3d_wrap.h"
#include "remap_wrap.h"
#include "grid3d.h"
using namespace LAMMPS_NS;
// DEBUG
#define DEBUG_AMOEBA 0
#if DEBUG_AMOEBA
char *labels[7] =
{(char *) "MPOLE_GRID", (char *) "POLAR_GRID",
(char *) "POLAR_GRIDC", (char *) "DISP_GRID",
(char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"};
enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2};
#endif
// END DEBUG
#define SCALE 0
//#define USE_AMOEBA_FFT
#ifdef USE_AMOEBA_FFT
// External functions from GPU library
int amoeba_setup_fft(const int size, const int numel, const int element_type);
int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode);
#endif
/* ----------------------------------------------------------------------
partition an FFT grid across processors
both for a brick and FFT x pencil decomposition
nx,nz,nz = global FFT grid size
order = size of stencil in each dimension that maps atoms to grid
adapted from PPPM::set_grid_local()
------------------------------------------------------------------------- */
AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair,
int nx_caller, int ny_caller, int nz_caller,
int order_caller, int which_caller) :
AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller,
which_caller)
{
}
/* ----------------------------------------------------------------------
perform pre-convolution grid operations for 4d cgrid_brick array
------------------------------------------------------------------------- */
FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
{
int ix,iy,iz,n;
// reverse comm for 4d brick grid + ghosts
#if DEBUG_AMOEBA
debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d");
#endif
gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
#if DEBUG_AMOEBA
debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d");
debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d");
#endif
// copy owned 4d brick grid values to FFT grid
n = 0;
for (iz = nzlo_in; iz <= nzhi_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++) {
cfft[n++] = cgrid_brick[iz][iy][ix][0];
cfft[n++] = cgrid_brick[iz][iy][ix][1];
}
// remap FFT grid from brick to x pencil partitioning
// NOTE: could just setup FFT to start from brick decomp and skip remap
remap->perform(cfft,cfft,remap_buf);
#if DEBUG_AMOEBA
debug_scalar(FFT,"PRE Convo / POST Remap");
debug_file(FFT,"pre.convo.post.remap");
#endif
double time0,time1;
MPI_Barrier(world);
time0 = platform::walltime();
// perform forward FFT
#ifdef USE_AMOEBA_FFT
amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD);
#else
fft1->compute(cfft,cfft,FFT3d::FORWARD);
#endif
time1 = platform::walltime();
time_fft += time1 - time0;
if (SCALE) {
double scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
}
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"PRE Convo / POST FFT");
debug_file(CFFT1,"pre.convo.post.fft");
#endif
return cfft;
}
/* ----------------------------------------------------------------------
perform post-convolution grid operations for 4d cgrid_brick array
------------------------------------------------------------------------- */
void *AmoebaConvolutionGPU::post_convolution_4d()
{
int ix,iy,iz,n;
// perform backward FFT
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
MPI_Barrier(world);
time0 = platform::walltime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = platform::walltime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");
debug_file(CFFT2,"post.convo.post.fft");
#endif
// copy 1d complex values into 4d complex grid
n = 0;
for (iz = nzlo_in; iz <= nzhi_in; iz++)
for (iy = nylo_in; iy <= nyhi_in; iy++)
for (ix = nxlo_in; ix <= nxhi_in; ix++) {
cgrid_brick[iz][iy][ix][0] = cfft[n++];
cgrid_brick[iz][iy][ix][1] = cfft[n++];
}
// forward comm to populate ghost grid values
#if DEBUG_AMOEBA
debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d");
debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d");
#endif
gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
gc_buf1,gc_buf2,MPI_FFT_SCALAR);
return (void *) cgrid_brick;
}

View File

@@ -0,0 +1,32 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/ Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H
#define LMP_AMOEBA_CONVOLUTION_GPU_H
#include "amoeba_convolution.h"
namespace LAMMPS_NS {
class AmoebaConvolutionGPU : public AmoebaConvolution {
public:
AmoebaConvolutionGPU(class LAMMPS *, class Pair *, int, int, int, int, int);
FFT_SCALAR *pre_convolution_4d() override;
void *post_convolution_4d() override;
};
} // namespace LAMMPS_NS
#endif


@@ -131,7 +131,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
_gpu_mode = GPU_NEIGH;
_particle_split = 1.0;
int nthreads = 0;
int newtonflag = 0;
int newtonflag = force->newton_pair;
int threads_per_atom = -1;
double binsize = 0.0;
char *opencl_args = nullptr;
@@ -360,6 +360,8 @@ double FixGPU::memory_usage()
return bytes;
}
/* ---------------------------------------------------------------------- */
double FixGPU::binsize(const double subx, const double suby,
const double subz, const int nlocal,
const double cut) {

src/GPU/pair_amoeba_gpu.cpp (new file, 2067 lines; diff suppressed because it is too large)

src/GPU/pair_amoeba_gpu.h (new file, 72 lines)

@@ -0,0 +1,72 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
// clang-format off
PairStyle(amoeba/gpu,PairAmoebaGPU);
// clang-format on
#else
#ifndef LMP_PAIR_AMOEBA_GPU_H
#define LMP_PAIR_AMOEBA_GPU_H
#include "pair_amoeba.h"
namespace LAMMPS_NS {
class PairAmoebaGPU : public PairAmoeba {
public:
PairAmoebaGPU(LAMMPS *lmp);
~PairAmoebaGPU() override;
void init_style() override;
double memory_usage() override;
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
void induce() override;
void multipole_real() override;
void udirect2b(double **, double **) override;
void umutual1(double **, double **) override;
void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
void umutual2b(double **, double **) override;
void ufield0c(double **, double **) override;
void polar_real() override;
void polar_kspace() override;
private:
int gpu_mode;
double cpu_time;
void *tq_pinned;
void *fieldp_pinned;
bool acc_float;
bool gpu_hal_ready;
bool gpu_repulsion_ready;
bool gpu_dispersion_real_ready;
bool gpu_multipole_real_ready;
bool gpu_udirect2b_ready;
bool gpu_umutual1_ready;
bool gpu_fphi_uind_ready;
bool gpu_umutual2b_ready;
bool gpu_polar_real_ready;
void udirect2b_cpu();
template<class numtyp>
void compute_force_from_torque(const numtyp*, double**, double*);
};
} // namespace LAMMPS_NS
#endif
#endif
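
Together with the virtual qualifiers added to pair_amoeba.h above, this header follows a plain override-the-hot-paths design: the base class keeps all driver logic and the accelerated subclass replaces only the routines it can offload. A reduced sketch of the dispatch pattern (class and method names simplified, not the actual LAMMPS declarations):

struct AmoebaBase {                      // stands in for PairAmoeba
  virtual ~AmoebaBase() = default;
  void polar() { polar_real(); }         // driver code always calls via the vtable
  virtual void polar_real() { /* CPU reference path */ }
};

struct AmoebaAccel : AmoebaBase {        // stands in for PairAmoebaGPU / PairHippoGPU
  void polar_real() override { /* device-accelerated path */ }
};

The gpu_*_ready flags above suggest the subclass can additionally fall back to the base-class CPU implementation per routine when a kernel is not available for the current hardware or precision.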

src/GPU/pair_hippo_gpu.cpp (new file, 1494 lines; diff suppressed because it is too large)

src/GPU/pair_hippo_gpu.h (new file, 73 lines)

@@ -0,0 +1,73 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
// clang-format off
PairStyle(hippo/gpu,PairHippoGPU);
// clang-format on
#else
#ifndef LMP_PAIR_HIPPO_GPU_H
#define LMP_PAIR_HIPPO_GPU_H
#include "pair_amoeba.h"
namespace LAMMPS_NS {
class PairHippoGPU : public PairAmoeba {
public:
PairHippoGPU(LAMMPS *lmp);
~PairHippoGPU() override;
void init_style() override;
double memory_usage() override;
enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
void induce() override;
void repulsion() override;
void dispersion_real() override;
void multipole_real() override;
void udirect2b(double **, double **) override;
void umutual1(double **, double **) override;
void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override;
void umutual2b(double **, double **) override;
void ufield0c(double **, double **) override;
void polar_real() override;
private:
int gpu_mode;
double cpu_time;
void *tq_pinned;
void *fieldp_pinned;
bool acc_float;
bool gpu_hal_ready;
bool gpu_repulsion_ready;
bool gpu_dispersion_real_ready;
bool gpu_multipole_real_ready;
bool gpu_udirect2b_ready;
bool gpu_umutual1_ready;
bool gpu_fphi_uind_ready;
bool gpu_umutual2b_ready;
bool gpu_polar_real_ready;
void udirect2b_cpu();
template<class numtyp>
void compute_force_from_torque(const numtyp*, double**, double*);
};
} // namespace LAMMPS_NS
#endif
#endif


@@ -204,6 +204,8 @@ action mliap_model_linear_kokkos.h mliap_model_linear.h
action mliap_model_python_kokkos.cpp mliap_model_linear.cpp
action mliap_model_python_kokkos.h mliap_model_linear.h
action mliap_model_kokkos.h mliap_model.h
action mliap_unified_kokkos.cpp mliap_unified.cpp
action mliap_unified_kokkos.h mliap_unified.h
action mliap_so3_kokkos.cpp mliap_so3.cpp
action mliap_so3_kokkos.h mliap_so3.h
action modify_kokkos.cpp
@@ -314,6 +316,8 @@ action pair_lj_spica_kokkos.cpp pair_lj_spica.cpp
action pair_lj_spica_kokkos.h pair_lj_spica.h
action pair_meam_kokkos.cpp pair_meam.cpp
action pair_meam_kokkos.h pair_meam.h
action pair_meam_ms_kokkos.cpp pair_meam_ms.cpp
action pair_meam_ms_kokkos.h pair_meam_ms.h
action pair_mliap_kokkos.cpp pair_mliap.cpp
action pair_mliap_kokkos.h pair_mliap.h
action pair_morse_kokkos.cpp
@@ -365,6 +369,7 @@ action verlet_kokkos.h
# Install cython pyx file only if non-KOKKOS version is present
action mliap_model_python_couple_kokkos.pyx mliap_model_python_couple.pyx
action mliap_unified_couple_kokkos.pyx mliap_unified_couple.pyx
# edit 2 Makefile.package files to include/exclude package info
@@ -423,15 +428,19 @@ fi
if (test $1 = 1) then
if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then
cythonize -3 ../mliap_model_python_couple_kokkos.pyx
cythonize -3 ../mliap_unified_couple_kokkos.pyx
fi
elif (test $1 = 0) then
rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h
rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h
elif (test $1 = 2) then
if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then
cythonize -3 ../mliap_model_python_couple_kokkos.pyx
cythonize -3 ../mliap_unified_couple_kokkos.pyx
else
rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h
rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h
fi
fi


@@ -39,7 +39,7 @@ FixNVTKokkos<DeviceType>::FixNVTKokkos(LAMMPS *lmp, int narg, char **arg) :
// id = fix-ID + temp
this->id_temp = utils::strdup(std::string(this->id)+"_temp");
this->modify->add_compute(fmt::format("{} all temp/kk",this->id_temp));
this->modify->add_compute(fmt::format("{} {} temp/kk",this->id_temp,this->group->names[this->igroup]));
this->tcomputeflag = 1;
}


@@ -67,7 +67,7 @@ FixNVTSllodKokkos<DeviceType>::FixNVTSllodKokkos(LAMMPS *lmp, int narg, char **a
}
this->id_temp = utils::strdup(std::string(this->id)+"_temp");
this->modify->add_compute(fmt::format("{} all temp/deform/kk",this->id_temp));
this->modify->add_compute(fmt::format("{} {} temp/deform/kk",this->id_temp,this->group->names[this->igroup]));
this->tcomputeflag = 1;
this->nondeformbias = 0;
}


@@ -77,9 +77,8 @@ void FixSetForceKokkos<DeviceType>::init()
template<class DeviceType>
void FixSetForceKokkos<DeviceType>::post_force(int /*vflag*/)
{
atomKK->sync(execution_space, X_MASK | F_MASK | MASK_MASK);
atomKK->sync(execution_space, F_MASK | MASK_MASK);
x = atomKK->k_x.view<DeviceType>();
f = atomKK->k_f.view<DeviceType>();
mask = atomKK->k_mask.view<DeviceType>();
@@ -88,6 +87,8 @@ void FixSetForceKokkos<DeviceType>::post_force(int /*vflag*/)
// update region if necessary
if (region) {
if (!utils::strmatch(region->style, "^block"))
error->all(FLERR,"Cannot (yet) use {}-style region with fix setforce/kk",region->style);
region->prematch();
DAT::tdual_int_1d k_match = DAT::tdual_int_1d("setforce:k_match",nlocal);
KokkosBase* regionKKBase = dynamic_cast<KokkosBase*>(region);


@@ -61,17 +61,44 @@ void MEAMKokkos<DeviceType>::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT
if (elti >= 0) {
scaleii = d_scale(type[i],type[i]);
d_rho1[i] = 0.0;
if (msmeamflag) {
d_rho2[i] = -1.0 / 3.0 * (d_arho2b[i] * d_arho2b[i]
- d_arho2mb[i] * d_arho2mb[i]);
} else{
d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i];
}
d_rho3[i] = 0.0;
for (int m = 0; m < 3; m++) {
if (msmeamflag) {
d_rho1[i] = d_rho1[i] + d_arho1(i, m) * d_arho1(i, m)
- d_arho1m(i, m) * d_arho1m(i, m);
d_rho3[i] = d_rho3[i] - 3.0 / 5.0 * (d_arho3b(i, m) * d_arho3b(i, m)
- d_arho3mb(i, m) * d_arho3mb(i, m));
} else{
d_rho1[i] += d_arho1(i,m) * d_arho1(i,m);
d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m);
}
for (int m = 0; m < 6; m++)
}
for (int m = 0; m < 6; m++){
if (msmeamflag) {
d_rho2[i] = d_rho2[i] + v2D[m] * (d_arho2(i, m) * d_arho2(i, m)
- d_arho2m(i, m) * d_arho2m(i, m));
} else{
d_rho2[i] += v2D[m] * d_arho2(i,m) * d_arho2(i,m);
}
}
for (int m = 0; m < 10; m++)
if (msmeamflag) {
d_rho3[i] = d_rho3[i] + v3D[m] * (d_arho3(i, m) * d_arho3(i, m)
- d_arho3m(i, m) * d_arho3m(i, m));
} else{
d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m);
}
if (msmeamflag) {
// with msmeam all t weights are already accounted for in rho
d_gamma[i] = d_rho1[i] + d_rho2[i] + d_rho3[i];
} else{
if (d_rho0[i] > 0.0) {
if (ialloy == 1) {
d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0));
@@ -87,8 +114,8 @@ void MEAMKokkos<DeviceType>::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT
d_t_ave(i,2) /= d_rho0[i];
}
}
d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i];
}
if (d_rho0[i] > 0.0)
d_gamma[i] /= (d_rho0[i] * d_rho0[i]);
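
Written out, the msmeam branch of this kernel subtracts the counterpart moments (the m-suffixed d_arho* views) inside each partial density before gamma is formed. In compact notation (A short for the d_arho* views of atom i, v2D/v3D the symmetry weight vectors):

\rho_1 = \sum_{m=1}^{3} \left[ A^{(1)2}_m - A^{(1m)2}_m \right]
\rho_2 = -\tfrac{1}{3}\left[ A^{(2b)2} - A^{(2bm)2} \right] + \sum_{m=1}^{6} v^{2D}_m \left[ A^{(2)2}_m - A^{(2m)2}_m \right]
\rho_3 = -\tfrac{3}{5}\sum_{m=1}^{3} \left[ A^{(3b)2}_m - A^{(3bm)2}_m \right] + \sum_{m=1}^{10} v^{3D}_m \left[ A^{(3)2}_m - A^{(3m)2}_m \right]
\gamma = \rho_1 + \rho_2 + \rho_3, \qquad \gamma \leftarrow \gamma / \rho_0^2 \;\; (\rho_0 > 0)

whereas the regular MEAM branch keeps the t_ave weighting of the partial densities and omits the subtracted terms.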

Some files were not shown because too many files have changed in this diff.