Merge pull request #2603 from wmbrownIntel/gpu-updateFeb2021
GPU Package Update February 2021
@@ -1,7 +1,9 @@
 set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
 set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h
                 ${GPU_SOURCES_DIR}/fix_gpu.h
-                ${GPU_SOURCES_DIR}/fix_gpu.cpp)
+                ${GPU_SOURCES_DIR}/fix_gpu.cpp
+                ${GPU_SOURCES_DIR}/fix_nh_gpu.h
+                ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp)
 target_compile_definitions(lammps PRIVATE -DLMP_GPU)

 set(GPU_API "opencl" CACHE STRING "API used by GPU package")
@@ -155,11 +157,6 @@ elseif(GPU_API STREQUAL "OPENCL")
   else()
     find_package(OpenCL REQUIRED)
   endif()
-  set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning")
-  set(OCL_TUNE_VALUES intel fermi kepler cypress generic)
-  set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES})
-  validate_option(OCL_TUNE OCL_TUNE_VALUES)
-  string(TOUPPER ${OCL_TUNE} OCL_TUNE)

   include(OpenCLUtils)
   set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h)
@@ -203,7 +200,7 @@ elseif(GPU_API STREQUAL "OPENCL")
   add_library(gpu STATIC ${GPU_LIB_SOURCES})
   target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
   target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
-  target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
+  target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
   target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)

 target_link_libraries(lammps PRIVATE gpu)

@@ -120,8 +120,6 @@ CMake build
 -D GPU_API=value      # value = opencl (default) or cuda or hip
 -D GPU_PREC=value     # precision setting
                       # value = double or mixed (default) or single
--D OCL_TUNE=value     # hardware choice for GPU_API=opencl
-                      # generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA)
 -D GPU_ARCH=value     # primary GPU hardware choice for GPU_API=cuda
                       # value = sm_XX, see below
                       # default is sm_50

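As an illustration, a CMake configuration using the updated options might
look like this (a sketch; the build directory layout and the PKG_GPU
package switch follow the standard LAMMPS CMake build procedure):

.. code-block:: bash

   mkdir build && cd build
   cmake ../cmake -D PKG_GPU=on -D GPU_API=opencl -D GPU_PREC=mixed
   cmake --build . --parallel
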
@@ -114,7 +114,7 @@ OPT.
 * :doc:`nph/eff <fix_nh_eff>`
 * :doc:`nph/sphere (o) <fix_nph_sphere>`
 * :doc:`nphug <fix_nphug>`
-* :doc:`npt (iko) <fix_nh>`
+* :doc:`npt (giko) <fix_nh>`
 * :doc:`npt/asphere (o) <fix_npt_asphere>`
 * :doc:`npt/body <fix_npt_body>`
 * :doc:`npt/cauchy <fix_npt_cauchy>`
@@ -122,8 +122,8 @@ OPT.
 * :doc:`npt/sphere (o) <fix_npt_sphere>`
 * :doc:`npt/uef <fix_nh_uef>`
 * :doc:`numdiff <fix_numdiff>`
-* :doc:`nve (iko) <fix_nve>`
-* :doc:`nve/asphere (i) <fix_nve_asphere>`
+* :doc:`nve (giko) <fix_nve>`
+* :doc:`nve/asphere (gi) <fix_nve_asphere>`
 * :doc:`nve/asphere/noforce <fix_nve_asphere_noforce>`
 * :doc:`nve/awpmd <fix_nve_awpmd>`
 * :doc:`nve/body <fix_nve_body>`
@@ -138,7 +138,7 @@ OPT.
 * :doc:`nve/spin <fix_nve_spin>`
 * :doc:`nve/tri <fix_nve_tri>`
 * :doc:`nvk <fix_nvk>`
-* :doc:`nvt (iko) <fix_nh>`
+* :doc:`nvt (giko) <fix_nh>`
 * :doc:`nvt/asphere (o) <fix_nvt_asphere>`
 * :doc:`nvt/body <fix_nvt_body>`
 * :doc:`nvt/eff <fix_nh_eff>`

@@ -122,7 +122,7 @@ OPT.
 * :doc:`lebedeva/z <pair_lebedeva_z>`
 * :doc:`lennard/mdf <pair_mdf>`
 * :doc:`line/lj <pair_line_lj>`
-* :doc:`lj/charmm/coul/charmm (iko) <pair_charmm>`
+* :doc:`lj/charmm/coul/charmm (giko) <pair_charmm>`
 * :doc:`lj/charmm/coul/charmm/implicit (ko) <pair_charmm>`
 * :doc:`lj/charmm/coul/long (gikot) <pair_charmm>`
 * :doc:`lj/charmm/coul/long/soft (o) <pair_fep_soft>`

@@ -1,11 +1,14 @@
 GPU package
 ===========

-The GPU package was developed by Mike Brown while at SNL and ORNL
-and his collaborators, particularly Trung Nguyen (now at Northwestern).
-It provides GPU versions of many pair styles and for parts of the
-:doc:`kspace_style pppm <kspace_style>` for long-range Coulombics.
-It has the following general features:
+The GPU package was developed by Mike Brown while at SNL and ORNL (now
+at Intel Corp.) and his collaborators, particularly Trung Nguyen (now at
+Northwestern). Support for AMD GPUs via HIP was added by Vsevolod Nikolskiy
+and coworkers at HSE University.
+
+The GPU package provides GPU versions of many pair styles and for
+parts of the :doc:`kspace_style pppm <kspace_style>` for long-range
+Coulombics. It has the following general features:

 * It is designed to exploit common GPU hardware configurations where one
   or more GPUs are coupled to many cores of one or more multi-core CPUs,
@@ -24,8 +27,9 @@ It has the following general features:
   force vectors.
 * LAMMPS-specific code is in the GPU package. It makes calls to a
   generic GPU library in the lib/gpu directory. This library provides
-  NVIDIA support as well as more general OpenCL support, so that the
-  same functionality is supported on a variety of hardware.
+  either Nvidia support, AMD support, or more general OpenCL support
+  (for Nvidia GPUs, AMD GPUs, Intel GPUs, and multi-core CPUs),
+  so that the same functionality is supported on a variety of hardware.

 **Required hardware/software:**

@@ -45,12 +49,23 @@ to have the OpenCL headers and the (vendor neutral) OpenCL library installed.
 In OpenCL mode, the acceleration depends on having an `OpenCL Installable Client Driver (ICD) <https://www.khronos.org/news/permalink/opencl-installable-client-driver-icd-loader>`_
 installed. There can be multiple of them for the same or different hardware
 (GPUs, CPUs, Accelerators) installed at the same time. OpenCL refers to those
-as 'platforms'. The GPU library will select the **first** suitable platform,
-but this can be overridden using the device option of the :doc:`package <package>`
+as 'platforms'. The GPU library will try to auto-select the best suited platform,
+but this can be overridden using the platform option of the :doc:`package <package>`
 command. Run lammps/lib/gpu/ocl_get_devices to get a list of available
 platforms and devices with a suitable ICD available.

-To compute and use this package in HIP mode, you have to have the AMD ROCm
+To compile and use this package for Intel GPUs, OpenCL or the Intel oneAPI
+HPC Toolkit can be installed using Linux package managers. The latter also
+provides optimized C++, MPI, and many other libraries and tools. See:
+
+* https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit/download.html
+
+If you do not have a discrete GPU card installed, this package can still provide
+significant speedups on some CPUs that include integrated GPUs. Additionally, for
+many Macs, OpenCL is already included with the OS and Makefiles are available
+in the lib/gpu directory.
+
+To compile and use this package in HIP mode, you have to have the AMD ROCm
 software installed. Versions of ROCm older than 3.5 are currently deprecated
 by AMD.

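The platform and device listing mentioned above can be produced from the
shell after the GPU library has been built:

.. code-block:: bash

   ./lib/gpu/ocl_get_devices    # run from the top-level LAMMPS directory
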
@@ -75,10 +90,20 @@ automatically if you create more MPI tasks/node than there are
 GPUs/node. E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be
 shared by 4 MPI tasks.

+The GPU package also has limited support for OpenMP for both
+multi-threading and vectorization of routines that are run on the CPUs.
+This requires that the GPU library and LAMMPS are built with flags to
+enable OpenMP support (e.g. -fopenmp). Some styles for time integration
+are also available in the GPU package. These run completely on the CPUs
+in full double precision, but exploit multi-threading and vectorization
+for faster performance.
+
 Use the "-sf gpu" :doc:`command-line switch <Run_options>`, which will
 automatically append "gpu" to styles that support it. Use the "-pk
 gpu Ng" :doc:`command-line switch <Run_options>` to set Ng = # of
-GPUs/node to use.
+GPUs/node to use. If Ng is 0, the number is selected automatically as
+the number of matching GPUs that have the highest number of compute
+cores.

 .. code-block:: bash

@@ -87,8 +112,8 @@ GPUs/node to use.
 mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script # ditto on 4 16-core nodes

 Note that if the "-sf gpu" switch is used, it also issues a default
-:doc:`package gpu 1 <package>` command, which sets the number of
-GPUs/node to 1.
+:doc:`package gpu 0 <package>` command, which will result in
+automatic selection of the number of GPUs to use.

 Using the "-pk" switch explicitly allows for setting of the number of
 GPUs/node to use and additional options. Its syntax is the same as
@@ -138,6 +163,13 @@ Likewise, you should experiment with the precision setting for the GPU
 library to see if single or mixed precision will give accurate
 results, since they will typically be faster.

+MPI parallelism typically outperforms OpenMP parallelism, but in some
+cases using fewer MPI tasks and multiple OpenMP threads with the GPU
+package can give better performance. 3-body potentials can often perform
+better with multiple OMP threads because the inter-process communication
+is higher for these styles with the GPU package in order to allow
+deterministic results.
+
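For example (a sketch; task and thread counts are illustrative):

.. code-block:: bash

   export OMP_NUM_THREADS=1
   mpirun -np 16 lmp_machine -sf gpu -pk gpu 2 -in in.script   # MPI tasks only
   export OMP_NUM_THREADS=4
   mpirun -np 4 lmp_machine -sf gpu -pk gpu 2 -in in.script    # fewer tasks, more threads
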
 **Guidelines for best performance:**

 * Using multiple MPI tasks per GPU will often give the best performance,
@@ -161,6 +193,12 @@ results, since they will typically be faster.
   :doc:`angle <angle_style>`, :doc:`dihedral <dihedral_style>`,
   :doc:`improper <improper_style>`, and :doc:`long-range <kspace_style>`
   calculations will not be included in the "Pair" time.
+* Since only part of the pppm kspace style is GPU accelerated, it
+  may be faster to only use GPU acceleration for Pair styles with
+  long-range electrostatics. See the "pair/only" keyword of the
+  package command for a shortcut to do that. The work between kspace
+  on the CPU and non-bonded interactions on the GPU can be balanced
+  through adjusting the coulomb cutoff without loss of accuracy.
 * When the *mode* setting for the package gpu command is force/neigh,
   the time for neighbor list calculations on the GPU will be added into
   the "Pair" time, not the "Neigh" time. An additional breakdown of the

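The "pair/only" shortcut from the new guideline above can be enabled from
the command line, for example (a sketch):

.. code-block:: bash

   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 pair/only on -in in.script
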
@@ -16,7 +16,7 @@ These are the accelerator packages currently in LAMMPS, either as
 standard or user packages:

 +-----------------------------------------+-------------------------------------------------------+
-| :doc:`GPU Package <Speed_gpu>`          | for NVIDIA GPUs as well as OpenCL support             |
+| :doc:`GPU Package <Speed_gpu>`          | for GPUs via CUDA, OpenCL, or ROCm HIP                |
 +-----------------------------------------+-------------------------------------------------------+
 | :doc:`USER-INTEL Package <Speed_intel>` | for Intel CPUs and Intel Xeon Phi                     |
 +-----------------------------------------+-------------------------------------------------------+
@@ -43,7 +43,7 @@ three kinds of hardware, via the listed packages:
 +-----------------+-----------------------------------------------------------------------------------------------------------------------------+
 | Many-core CPUs  | :doc:`USER-INTEL <Speed_intel>`, :doc:`KOKKOS <Speed_kokkos>`, :doc:`USER-OMP <Speed_omp>`, :doc:`OPT <Speed_opt>` packages |
 +-----------------+-----------------------------------------------------------------------------------------------------------------------------+
-| NVIDIA/AMD GPUs | :doc:`GPU <Speed_gpu>`, :doc:`KOKKOS <Speed_kokkos>` packages |
+| GPUs            | :doc:`GPU <Speed_gpu>`, :doc:`KOKKOS <Speed_kokkos>` packages |
 +-----------------+-----------------------------------------------------------------------------------------------------------------------------+
 | Intel Phi/AVX   | :doc:`USER-INTEL <Speed_intel>`, :doc:`KOKKOS <Speed_kokkos>` packages |
 +-----------------+-----------------------------------------------------------------------------------------------------------------------------+
@@ -154,8 +154,8 @@ Here is a brief summary of what the various packages provide. Details
 are in the individual accelerator sections.

 * Styles with a "gpu" suffix are part of the GPU package and can be run
-  on NVIDIA or AMD GPUs. The speed-up on a GPU depends on a variety of
-  factors, discussed in the accelerator sections.
+  on Intel, NVIDIA, or AMD GPUs. The speed-up on a GPU depends on a
+  variety of factors, discussed in the accelerator sections.
 * Styles with an "intel" suffix are part of the USER-INTEL
   package. These styles support vectorized single and mixed precision
   calculations, in addition to full double precision. In extreme cases,

@@ -1,8 +1,10 @@
 .. index:: fix nvt
+.. index:: fix nvt/gpu
 .. index:: fix nvt/intel
 .. index:: fix nvt/kk
 .. index:: fix nvt/omp
 .. index:: fix npt
+.. index:: fix npt/gpu
 .. index:: fix npt/intel
 .. index:: fix npt/kk
 .. index:: fix npt/omp
@@ -13,12 +15,12 @@
 fix nvt command
 ===============

-Accelerator Variants: *nvt/intel*, *nvt/kk*, *nvt/omp*
+Accelerator Variants: *nvt/gpu*, *nvt/intel*, *nvt/kk*, *nvt/omp*

 fix npt command
 ===============

-Accelerator Variants: *npt/intel*, *npt/kk*, *npt/omp*
+Accelerator Variants: *npt/gpu*, *npt/intel*, *npt/kk*, *npt/omp*

 fix nph command
 ===============

@@ -1,4 +1,5 @@
 .. index:: fix nve
+.. index:: fix nve/gpu
 .. index:: fix nve/intel
 .. index:: fix nve/kk
 .. index:: fix nve/omp
@@ -6,7 +7,7 @@
 fix nve command
 ===============

-Accelerator Variants: *nve/intel*, *nve/kk*, *nve/omp*
+Accelerator Variants: *nve/gpu*, *nve/intel*, *nve/kk*, *nve/omp*

 Syntax
 """"""

@@ -1,10 +1,11 @@
 .. index:: fix nve/asphere
+.. index:: fix nve/asphere/gpu
 .. index:: fix nve/asphere/intel

 fix nve/asphere command
 =======================

-Accelerator Variants: *nve/asphere/intel*
+Accelerator Variants: *nve/asphere/gpu*, *nve/asphere/intel*

 Syntax
 """"""

@@ -18,7 +18,7 @@ Syntax
 *gpu* args = Ngpu keyword value ...
   Ngpu = # of GPUs per node
   zero or more keyword/value pairs may be appended
-  keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *device* or *blocksize*
+  keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *blocksize* or *platform* or *device_type* or *ocl_args*
   *neigh* value = *yes* or *no*
     yes = neighbor list build on GPU (default)
     no = neighbor list build on CPU
@@ -32,17 +32,18 @@ Syntax
     size = bin size for neighbor list construction (distance units)
   *split* = fraction
     fraction = fraction of atoms assigned to GPU (default = 1.0)
-  *gpuID* values = first last
-    first = ID of first GPU to be used on each node
-    last = ID of last GPU to be used on each node
   *tpa* value = Nthreads
-    Nthreads = # of GPU threads used per atom
-  *device* value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13
-    platform_id = numerical OpenCL platform id (default: -1)
-    device_type = *kepler* or *fermi* or *cypress* or *intel* or *phi* or *generic* or *custom*
-    val1,val2,... = custom OpenCL tune parameters (see below for details)
+    Nthreads = # of GPU vector lanes used per atom
   *blocksize* value = size
     size = thread block size for pair force computation
+  *platform* value = id
+    id = For OpenCL, platform ID for the GPU or accelerator
+  *gpuID* values = id
+    id = ID of first GPU to be used on each node
+  *device_type* value = *intelgpu* or *nvidiagpu* or *amdgpu* or *applegpu* or *generic* or *custom,val1,val2,...*
+    val1,val2,... = custom OpenCL accelerator configuration parameters (see below for details)
+  *ocl_args* value = args
+    args = List of additional OpenCL compiler arguments delimited by colons
 *intel* args = NPhi keyword value ...
   Nphi = # of co-processors per node
   zero or more keyword/value pairs may be appended
@@ -112,12 +113,10 @@ Examples

 .. code-block:: LAMMPS

-   package gpu 1
+   package gpu 0
    package gpu 1 split 0.75
    package gpu 2 split -1.0
-   package gpu 1 device kepler
-   package gpu 1 device 2:generic
-   package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128
+   package gpu 0 device_type intelgpu
    package kokkos neigh half comm device
    package omp 0 neigh no
    package omp 4
@@ -174,10 +173,18 @@ simulations.
 The *gpu* style invokes settings associated with the use of the GPU
 package.

-The *Ngpu* argument sets the number of GPUs per node. There must be
-at least as many MPI tasks per node as GPUs, as set by the mpirun or
-mpiexec command. If there are more MPI tasks (per node)
-than GPUs, multiple MPI tasks will share each GPU.
+The *Ngpu* argument sets the number of GPUs per node. If *Ngpu* is 0
+and no other keywords are specified, GPU or accelerator devices are
+auto-selected. In this process, all platforms are searched for
+accelerator devices and GPUs are chosen if available. The device with
+the highest number of compute cores is selected. The number of devices
+is increased to be the number of matching accelerators with the same
+number of compute cores. If there are more devices than MPI tasks,
+the additional devices will be unused. The auto-selection of GPUs/
+accelerator devices and platforms can be restricted by specifying
+a non-zero value for *Ngpu* and / or using the *gpuID*, *platform*,
+and *device_type* keywords as described below. If there are more MPI
+tasks (per node) than GPUs, multiple MPI tasks will share each GPU.

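For example, to rely on the auto-selection described above (a sketch):

.. code-block:: bash

   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 -in in.script   # Ngpu = 0 auto-selects
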
 Optional keyword/value pairs can also be specified. Each has a
 default value as listed below.
@@ -212,18 +219,8 @@ overlapped with all other computations on the CPU.

 The *binsize* keyword sets the size of bins used to bin atoms in
 neighbor list builds performed on the GPU, if *neigh* = *yes* is set.
-If *binsize* is set to 0.0 (the default), then bins = the size of the
-pairwise cutoff + neighbor skin distance. This is 2x larger than the
-LAMMPS default used for neighbor list building on the CPU. This will
-be close to optimal for the GPU, so you do not normally need to use
-this keyword. Note that if you use a longer-than-usual pairwise
-cutoff, e.g. to allow for a smaller fraction of KSpace work with a
-:doc:`long-range Coulombic solver <kspace_style>` because the GPU is
-faster at performing pairwise interactions, then it may be optimal to
-make the *binsize* smaller than the default. For example, with a
-cutoff of 20\*sigma in LJ :doc:`units <units>` and a neighbor skin
-distance of sigma, a *binsize* = 5.25\*sigma can be more efficient than
-the default.
+If *binsize* is set to 0.0 (the default), then the binsize is set
+automatically using heuristics in the GPU package.

 The *split* keyword can be used for load balancing force calculations
 between CPU and GPU cores in GPU-enabled pair styles. If 0 < *split* <
@@ -257,63 +254,71 @@ cores would perform force calculations for some fraction of the
 particles at the same time the GPUs performed force calculation for
 the other particles.

-The *gpuID* keyword allows selection of which GPUs on each node will
-be used for a simulation. The *first* and *last* values specify the
-GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last =
-Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
-of physical GPUs. If you only wish to use a subset, set Ngpu to a
-smaller number and first/last to a sub-range of the available GPUs.
+The *gpuID* keyword is used to specify the first ID for the GPU or
+other accelerator that LAMMPS will use. For example, if the ID is
+1 and *Ngpu* is 3, GPUs 1-3 will be used. Device IDs should be
+determined from the output of nvc_get_devices, ocl_get_devices,
+or hip_get_devices, as provided in the lib/gpu directory. When using
+OpenCL with accelerators that have main memory NUMA, the accelerators
+can be split into smaller virtual accelerators for more efficient use
+with MPI.

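For example, to use GPUs 1-3 on each node, per the description above
(a sketch):

.. code-block:: bash

   mpirun -np 12 lmp_machine -sf gpu -pk gpu 3 gpuID 1 -in in.script
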
-The *tpa* keyword sets the number of GPU thread per atom used to
+The *tpa* keyword sets the number of GPU vector lanes per atom used to
 perform force calculations. With a default value of 1, the number of
 threads will be chosen based on the pair style, however, the value can
 be set explicitly with this keyword to fine-tune performance. For
 large cutoffs or with a small number of particles per GPU, increasing
 the value can improve performance. The number of threads per atom must
-be a power of 2 and currently cannot be greater than 32.
-
-The *device* keyword can be used to tune parameters optimized for a
-specific accelerator and platform when using OpenCL. OpenCL supports
-the concept of a **platform**\ , which represents one or more devices that
-share the same driver (e.g. there would be a different platform for
-GPUs from different vendors or for CPU based accelerator support).
-In LAMMPS only one platform can be active at a time and by default
-the first platform with an accelerator is selected. This is equivalent
-to using a platform ID of -1. The platform ID is a number corresponding
-to the output of the ocl_get_devices tool. The platform ID is passed
-to the GPU library, by prefixing the *device* keyword with that number
-separated by a colon. For CUDA, the *device* keyword is ignored.
-Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA
-Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device.
-More devices may be added later. The default device type can be
-specified when building LAMMPS with the GPU library, via setting a
-variable in the lib/gpu/Makefile that is used.
-
-In addition, a device type *custom* is available, which is followed by
-13 comma separated numbers, which allows to set those tweakable parameters
-from the package command. It can be combined with the (colon separated)
-platform id. The individual settings are:
-
-* MEM_THREADS
-* THREADS_PER_ATOM
-* THREADS_PER_CHARGE
-* BLOCK_PAIR
-* MAX_SHARED_TYPES
-* BLOCK_NBOR_BUILD
-* BLOCK_BIO_PAIR
-* BLOCK_ELLIPSE
-* WARP_SIZE
-* PPPM_BLOCK_1D
-* BLOCK_CELL_2D
-* BLOCK_CELL_ID
-* MAX_BIO_SHARED_TYPES
+be a power of 2 and currently cannot be greater than the SIMD width
+for the GPU / accelerator. In the case it exceeds the SIMD width, it
+will automatically be decreased to meet the restriction.

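For example, to request 4 vector lanes per atom (an illustrative value;
it must be a power of 2):

.. code-block:: bash

   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 tpa 4 -in in.script
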
 The *blocksize* keyword allows you to tweak the number of threads used
 per thread block. This number should be a multiple of 32 (for GPUs)
 and its maximum depends on the specific GPU hardware. Typical choices
 are 64, 128, or 256. A larger block size increases occupancy of
 individual GPU cores, but reduces the total number of thread blocks,
-thus may lead to load imbalance.
+thus may lead to load imbalance. On modern hardware, the sensitivity
+to the blocksize is typically low.

+The *platform* keyword is only used with OpenCL to specify the ID for
+an OpenCL platform. See the output from ocl_get_devices in the lib/gpu
+directory. In LAMMPS only one platform can be active at a time and by
+default (id=-1) the platform is auto-selected to find the GPU with the
+most compute cores. When *Ngpu* or other keywords are specified, the
+auto-selection is appropriately restricted. For example, if *Ngpu* is
+3, only platforms with at least 3 accelerators are considered. Similar
+restrictions can be enforced by the *gpuID* and *device_type* keywords.

+The *device_type* keyword can be used for OpenCL to specify the type of
+GPU to use or specify a custom configuration for an accelerator. In most
+cases this selection will be automatic and there is no need to use the
+keyword. The *applegpu* type is not specific to a particular GPU vendor,
+but is separate due to the more restrictive Apple OpenCL implementation.
+For expert users, a custom configuration can be specified with the
+*custom* keyword followed by these parameters:

+CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH,
+THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
+BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
+BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
+PPPM_MAX_SPLINE.

+CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX
+(NVIDIA) or OpenCL extensions (Intel) should be used for horizontal
+vector operations. FAST_MATH in {0,1} indicates that OpenCL fast math
+optimizations are used during the build and hardware-accelerated
+transcendental functions are used when available. THREADS_PER_* give the
+default *tpa* values for ellipsoidal models, styles using charge, and
+any other styles. The BLOCK_* parameters specify the block sizes for
+various kernel calls and the MAX_*SHARED* parameters are used to
+determine the amount of local shared memory to use for storing model
+parameters.

+For OpenCL, the routines are compiled at runtime for the specified GPU
+or accelerator architecture. The *ocl_args* keyword can be used to
+specify additional flags for the runtime build.

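For example (sketches; the extra OpenCL compiler flag shown is
illustrative, not taken from this PR):

.. code-block:: bash

   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 device_type intelgpu -in in.script
   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 ocl_args -cl-std=CL2.0 -in in.script
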
----------

@@ -658,9 +663,9 @@ Related commands
 Default
 """""""

-For the GPU package, the default is Ngpu = 1 and the option defaults
+For the GPU package, the default is Ngpu = 0 and the option defaults
 are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
-to Ngpu-1, tpa = 1, and device = not used. These settings are made
+to Ngpu-1, tpa = 1, and platform = -1. These settings are made
 automatically if the "-sf gpu" :doc:`command-line switch <Run_options>`
 is used. If it is not used, you must invoke the package gpu command
 in your input script or via the "-pk gpu" :doc:`command-line switch <Run_options>`.

@@ -1,4 +1,5 @@
 .. index:: pair_style lj/charmm/coul/charmm
+.. index:: pair_style lj/charmm/coul/charmm/gpu
 .. index:: pair_style lj/charmm/coul/charmm/intel
 .. index:: pair_style lj/charmm/coul/charmm/kk
 .. index:: pair_style lj/charmm/coul/charmm/omp
@@ -19,7 +20,7 @@
 pair_style lj/charmm/coul/charmm command
 ========================================

-Accelerator Variants: *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp*
+Accelerator Variants: *lj/charmm/coul/charmm/gpu*, *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp*

 pair_style lj/charmm/coul/charmm/implicit command
 =================================================
@@ -2297,6 +2297,7 @@ omegaz
 Omelyan
 omp
 OMP
+oneAPI
 onelevel
 oneway
 onn
@@ -2528,6 +2529,7 @@ ptm
 PTM
 ptol
 ptr
+PTX
 pu
 purdue
 Purohit

@@ -51,7 +51,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c

 # host code compiler and settings

-CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
+CUDR_CPP = mpicxx -fopenmp -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
 CUDR_OPTS = -O2 $(LMP_INC)
 CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
        $(CUDPP_OPT)

@@ -17,7 +17,7 @@ LMP_INC = -DLAMMPS_SMALLBIG
 HIP_PRECISION = -D_SINGLE_DOUBLE

 HIP_OPTS = -O3
-HIP_HOST_OPTS = -Wno-deprecated-declarations
+HIP_HOST_OPTS = -Wno-deprecated-declarations -fopenmp
 HIP_HOST_INCLUDE =

 # use device sort

@@ -1,5 +1,5 @@
 # Settings that the LAMMPS build will import when this package library is used

-gpu_SYSINC =
+gpu_SYSINC = -DFFT_SINGLE
 gpu_SYSLIB = -framework OpenCL
 gpu_SYSPATH =

@@ -1,25 +1,21 @@
 # /* ----------------------------------------------------------------------
-#                  Generic Linux Makefile for OpenCL
+#          Generic Linux Makefile for OpenCL - Mixed precision
 # ------------------------------------------------------------------------- */

 # which file will be copied to Makefile.lammps

 EXTRAMAKE = Makefile.lammps.opencl

-# OCL_TUNE = -DFERMI_OCL    # -- Uncomment for NVIDIA Fermi
-# OCL_TUNE = -DKEPLER_OCL   # -- Uncomment for NVIDIA Kepler
-# OCL_TUNE = -DCYPRESS_OCL  # -- Uncomment for AMD Cypress
-OCL_TUNE = -DGENERIC_OCL    # -- Uncomment for generic device
-
 # this setting should match LAMMPS Makefile
 # one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL

 LMP_INC = -DLAMMPS_SMALLBIG

-OCL_INC = -I/usr/local/cuda/include # Path to CL directory
-OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) -std=c++11
-OCL_LINK = -L/usr/local/cuda/lib64 -lOpenCL
+OCL_INC =
+OCL_CPP = mpic++ -std=c++11 -O3 -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
+OCL_LINK = -lOpenCL
+OCL_PREC = -D_SINGLE_DOUBLE
+OCL_TUNE = -fopenmp -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT

 BIN_DIR = ./
 OBJ_DIR = ./
@@ -28,4 +24,3 @@ AR = ar
 BSH = /bin/sh

 include Opencl.makefile
-

@@ -1,19 +1,17 @@
 # /* ----------------------------------------------------------------------
-#                   Generic Mac Makefile for OpenCL
+#     Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE
 # ------------------------------------------------------------------------- */

 # which file will be copied to Makefile.lammps

 EXTRAMAKE = Makefile.lammps.mac_ocl

-OCL_TUNE = -DFERMI_OCL      # -- Uncomment for NVIDIA Fermi
-# OCL_TUNE = -DKEPLER_OCL   # -- Uncomment for NVIDIA Kepler
-# OCL_TUNE = -DCYPRESS_OCL  # -- Uncomment for AMD Cypress
-# OCL_TUNE = -DGENERIC_OCL  # -- Uncomment for generic device
+LMP_INC = -DLAMMPS_SMALLBIG

-OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT
+OCL_CPP = clang++ -std=c++11 -O3 -I../../src/STUBS
 OCL_LINK = -framework OpenCL
+OCL_PREC = -D_SINGLE_SINGLE
+OCL_TUNE = -DUCL_NO_EXIT

 BIN_DIR = ./
 OBJ_DIR = ./

lib/gpu/Makefile.mac_opencl_mpi (new file, 23 lines)
@@ -0,0 +1,23 @@
+# /* ----------------------------------------------------------------------
+#     Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE
+# ------------------------------------------------------------------------- */
+
+# which file will be copied to Makefile.lammps
+
+EXTRAMAKE = Makefile.lammps.mac_ocl
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+OCL_CPP = mpicxx -std=c++11 -O3 -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
+OCL_LINK = -framework OpenCL
+OCL_PREC = -D_SINGLE_SINGLE
+OCL_TUNE = -DUCL_NO_EXIT -DMPI_GERYON
+
+BIN_DIR = ./
+OBJ_DIR = ./
+LIB_DIR = ./
+AR = ar
+BSH = /bin/sh
+
+include Opencl.makefile

lib/gpu/Makefile.oneapi (new file, 26 lines)
@@ -0,0 +1,26 @@
+# /* ----------------------------------------------------------------------
+#                  Generic Linux Makefile for OpenCL
+# ------------------------------------------------------------------------- */
+
+# which file will be copied to Makefile.lammps
+
+EXTRAMAKE = Makefile.lammps.opencl
+
+# this setting should match LAMMPS Makefile
+# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+OCL_INC =
+OCL_CPP = mpiicpc -std=c++11 -xHost -O2 -qopenmp -qopenmp-simd -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
+OCL_LINK = -lOpenCL
+OCL_PREC = -D_SINGLE_DOUBLE
+OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -fp-model fast=2 -no-prec-div
+
+BIN_DIR = ./
+OBJ_DIR = ./
+LIB_DIR = ./
+AR = ar
+BSH = /bin/sh
+
+include Opencl.makefile
@@ -1,92 +0,0 @@
-# /* ----------------------------------------------------------------------
-#                  Generic Linux Makefile for OpenCL
-# ------------------------------------------------------------------------- */
-
-# which file will be copied to Makefile.lammps
-
-EXTRAMAKE = Makefile.lammps.opencl
-
-# this setting should match LAMMPS Makefile
-# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
-
-LMP_INC = -DLAMMPS_SMALLBIG
-
-# precision for GPU calculations
-# -D_SINGLE_SINGLE  # Single precision for all calculations
-# -D_DOUBLE_DOUBLE  # Double precision for all calculations
-# -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double
-
-OCL_PREC = -D_SINGLE_DOUBLE
-
-BIN_DIR = ./
-OBJ_DIR = ./
-LIB_DIR = ./
-AR = ar
-BSH = /bin/sh
-
-# Compiler and linker settings
-
-# OCL_TUNE = -DFERMI_OCL    # -- Uncomment for NVIDIA Fermi
-# OCL_TUNE = -DKEPLER_OCL   # -- Uncomment for NVIDIA Kepler
-# OCL_TUNE = -DCYPRESS_OCL  # -- Uncomment for AMD Cypress
-OCL_TUNE = -DGENERIC_OCL    # -- Uncomment for generic device
-
-OCL_INC = -I/usr/local/cuda/include # Path to CL directory
-OCL_CPP = mpic++ $(DEFAULT_DEVICE) -g -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
-OCL_LINK = -lOpenCL
-OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL
-
-# Headers for Geryon
-UCL_H = $(wildcard ./geryon/ucl*.h)
-OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h
-PRE1_H = lal_preprocessor.h lal_aux_fun1.h
-ALL_H = $(OCL_H) $(wildcard ./lal_*.h)
-
-# Source files
-SRCS := $(wildcard ./lal_*.cpp)
-OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
-CUS := $(wildcard lal_*.cu)
-KERS := $(subst ./,$(OBJ_DIR)/,$(CUS:lal_%.cu=%_cl.h))
-KERS := $(addprefix $(OBJ_DIR)/, $(KERS))
-
-# targets
-
-GPU_LIB = $(LIB_DIR)/libgpu.a
-
-EXECS = $(BIN_DIR)/ocl_get_devices
-
-all: $(OBJ_DIR) $(KERS) $(GPU_LIB) $(EXECS)
-
-$(OBJ_DIR):
-	mkdir -p $@
-
-# device code compilation
-
-$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
-	$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;
-
-# host code compilation
-
-$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS)
-	$(OCL) -o $@ -c $< -I$(OBJ_DIR)
-
-# build libgpu.a
-
-$(GPU_LIB): $(OBJS)
-	$(AR) -crusv $(GPU_LIB) $(OBJS)
-	@cp $(EXTRAMAKE) Makefile.lammps
-
-# test app for querying device info
-
-$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H)
-	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
-
-clean:
-	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo
-
-veryclean: clean
-	-rm -rf *~ *.linkinfo
-
-cleanlib:
-	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo

@@ -1,6 +1,7 @@
 # Headers for Geryon
 UCL_H = $(wildcard ./geryon/ucl*.h)
-NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h
+NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \
+        lal_pre_cuda_hip.h
 ALL_H = $(NVD_H) $(wildcard ./lal_*.h)

 # Source files
@@ -39,17 +40,21 @@ BIN2C = $(CUDA_HOME)/bin/bin2c

 # device code compilation

-$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
+$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
+                         lal_pre_cuda_hip.h
 	$(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu

 $(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin
 	$(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h
+	rm $(OBJ_DIR)/pppm_f.cubin

-$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
+$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
+                         lal_pre_cuda_hip.h
 	$(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu

 $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
 	$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
+	rm $(OBJ_DIR)/pppm_d.cubin

 $(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
 	$(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu
@@ -93,7 +98,7 @@ $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

 clean:
-	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.linkinfo
+	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.cubin *.linkinfo

 veryclean: clean
 	-rm -rf *~ *.linkinfo

@@ -1,8 +1,15 @@
+# Common headers for kernels
+PRE1_H = lal_preprocessor.h lal_aux_fun1.h
+
 # Headers for Geryon
 UCL_H = $(wildcard ./geryon/ucl*.h)
-OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h
-PRE1_H = lal_preprocessor.h lal_aux_fun1.h
 ALL_H = $(OCL_H) $(wildcard ./lal_*.h)
+OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h
+
+# Headers for Host files
+HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \
+         lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
+         lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
+         lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H)

 # Source files
 SRCS := $(wildcard ./lal_*.cpp)
@@ -28,12 +35,75 @@ OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL

 # device code compilation

+$(OBJ_DIR)/atom_cl.h: lal_atom.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh atom lal_preprocessor.h lal_atom.cu $(OBJ_DIR)/atom_cl.h
+
+$(OBJ_DIR)/neighbor_cpu_cl.h: lal_neighbor_cpu.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh neighbor_cpu lal_preprocessor.h lal_neighbor_cpu.cu $(OBJ_DIR)/neighbor_cpu_cl.h
+
+$(OBJ_DIR)/neighbor_gpu_cl.h: lal_neighbor_gpu.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh neighbor_gpu lal_preprocessor.h lal_neighbor_gpu.cu $(OBJ_DIR)/neighbor_gpu_cl.h
+
+$(OBJ_DIR)/device_cl.h: lal_device.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh device lal_preprocessor.h lal_device.cu $(OBJ_DIR)/device_cl.h
+
+$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
+
+$(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h
+
+$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu $(PRE1_H) lal_ellipsoid_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh gayberne $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;
+
+$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu $(PRE1_H) lal_ellipsoid_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;
+
+$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu $(PRE1_H) lal_ellipsoid_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh re_squared $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;
+
+$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu $(PRE1_H) lal_ellipsoid_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;
+
+$(OBJ_DIR)/tersoff_cl.h: lal_tersoff.cu $(PRE1_H) lal_tersoff_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh tersoff $(PRE1_H) lal_tersoff_extra.h lal_tersoff.cu $(OBJ_DIR)/tersoff_cl.h;
+
+$(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh tersoff_mod $(PRE1_H) lal_tersoff_mod_extra.h lal_tersoff_mod.cu $(OBJ_DIR)/tersoff_mod_cl.h;
+
+$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h;
+
 $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;

 # host code compilation

-$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS)
+$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
+	$(OCL) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
+	$(OCL) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
+	$(OCL) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
+	$(OCL) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
+	$(OCL) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
+	$(OCL) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
+	$(OCL) -o $@ -c $< -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
+	$(OCL) -o $@ -c $< -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cl.h $(HOST_H)
+	$(OCL) -o $@ -c $< -I$(OBJ_DIR)

 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H)

lib/gpu/README (359 lines)
@@ -4,18 +4,109 @@

 W. Michael Brown (ORNL)
 Trung Dac Nguyen (ORNL/Northwestern)
-Peng Wang (NVIDIA)
+Nitin Dhamankar (Intel)
 Axel Kohlmeyer (Temple)
+Peng Wang (NVIDIA)
 Anders Hafreager (UiO)
+V. Nikolskiy (HSE)
 Maurice de Koning (Unicamp/Brazil)
 Rodolfo Paula Leite (Unicamp/Brazil)
 Steve Plimpton (SNL)
 Inderaj Bains (NVIDIA)

--------------------------------------------------------------------
+------------------------------------------------------------------------------

-This directory has source files to build a library that LAMMPS
-links against when using the GPU package.
+This directory has source files to build a library that LAMMPS links against
+when using the GPU package.

-This library must be built with a C++ compiler, before LAMMPS is
-built, so LAMMPS can link against it.
+This library must be built with a C++ compiler along with CUDA, HIP, or OpenCL
+before LAMMPS is built, so LAMMPS can link against it.
+
+This library, libgpu.a, provides routines for acceleration of certain
+LAMMPS styles and neighbor list builds using CUDA, OpenCL, or ROCm HIP.
+
+Pair styles supported by this library are marked in the list of Pair style
+potentials with a "g". See the online version at:
+
+https://lammps.sandia.gov/doc/Commands_pair.html
+
+In addition the (plain) pppm kspace style is supported as well.
+
+------------------------------------------------------------------------------
+DEVICE QUERY
+------------------------------------------------------------------------------
+The gpu library includes binaries to check for available GPUs and their
+properties. It is a good idea to run this on first use to make sure the
+system and build are set up properly. Additionally, the GPU numbering for
+specific selection of devices should be taken from this output. The GPU
+library may split some accelerators into separate virtual accelerators for
+efficient use with MPI.
+
+After building the GPU library, for OpenCL:
+  ./ocl_get_devices
+for CUDA:
+  ./nvc_get_devices
+and for ROCm HIP:
+  ./hip_get_devices

+------------------------------------------------------------------------------
+QUICK START
+------------------------------------------------------------------------------
+OpenCL: Mac without MPI:
+  make -f Makefile.mac_opencl -j; cd ../../src/; make mpi-stubs
+  make g++_serial -j
+  ./lmp_g++_serial -in ../bench/in.lj -log none -sf gpu
+
+OpenCL: Mac with MPI:
+  make -f Makefile.mac_opencl_mpi -j; cd ../../src/; make g++_openmpi -j
+  mpirun -np $NUM_MPI ./lmp_g++_openmpi -in ../bench/in.lj -log none -sf gpu
+
+OpenCL: Linux with Intel oneAPI:
+  make -f Makefile.oneapi -j; cd ../../src; make oneapi -j
+  export OMP_NUM_THREADS=$NUM_THREADS
+  mpirun -np $NUM_MPI ./lmp_oneapi -in ../bench/in.lj -log none -sf gpu
+
+OpenCL: Linux with MPI:
+  make -f Makefile.linux_opencl -j; cd ../../src; make omp -j
+  export OMP_NUM_THREADS=$NUM_THREADS
+  mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu
+
+NVIDIA CUDA:
+  make -f Makefile.cuda_mps -j; cd ../../src; make omp -j
+  export CUDA_MPS_LOG_DIRECTORY=/tmp; export CUDA_MPS_PIPE_DIRECTORY=/tmp
+  nvidia-smi -i 0 -c EXCLUSIVE_PROCESS
+  export OMP_NUM_THREADS=$NUM_THREADS
+  mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu
+  echo quit | /usr/bin/nvidia-cuda-mps-control
+
+AMD HIP:
+  make -f Makefile.hip -j; cd ../../src; make omp -j
+  export OMP_NUM_THREADS=$NUM_THREADS
+  mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu

+------------------------------------------------------------------------------
+Installing oneAPI, OpenCL, CUDA, or ROCm
+------------------------------------------------------------------------------
+The easiest approach is to use the Linux package manager to perform the
+installation from Intel, NVIDIA, etc. repositories. All are available for
+free. The oneAPI installation includes Intel optimized MPI and C++ compilers,
+along with many libraries. Alternatively, Intel OpenCL can also be installed
+separately from the Intel repository.
+
+NOTE: Installation of the CUDA SDK is not required, only the CUDA toolkit.
+
+See:
+
+https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit.html
+
+https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
+
+https://github.com/RadeonOpenCompute/ROCm

+------------------------------------------------------------------------------
+Build Intro
+------------------------------------------------------------------------------

 You can type "make lib-gpu" from the src directory to see help on how
 to build this library via make commands, or you can do the same thing
@@ -25,7 +116,7 @@ do it manually by following the instructions below.
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system. For example:

-make -f Makefile.linux
+make -f Makefile.linux_opencl

 When you are done building this library, two files should
 exist in this directory:
@@ -45,33 +136,132 @@ IMPORTANT: If you re-build the library, e.g. for a different precision
 Makefile.linux clean, to insure all previous derived files are removed
 before the new build is done.

-Makefile.lammps has settings for 3 variables:
+NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
+or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in
+src/MAKE/Makefile.foo) should be consistent with that specified
+when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).

-user-gpu_SYSINC = leave blank for this package
-user-gpu_SYSLIB = CUDA libraries needed by this package
-user-gpu_SYSPATH = path(s) to where those libraries are
-
-Because you have the CUDA compilers on your system, you should have
-the needed libraries. If the CUDA development tools were installed
-in the standard manner, the settings in the Makefile.lammps.standard
-file should work.
+------------------------------------------------------------------------------
+PRECISION MODES
+------------------------------------------------------------------------------
+The GPU library supports 3 precision modes: single, double, and mixed, with
+the latter being the default for most Makefiles aside from Mac specific
+Makefiles due to the more restrictive nature of the Apple OpenCL for some
+devices.

--------------------------------------------------------------------
+To specify the precision mode (output to the screen before LAMMPS runs for
+verification), set either CUDA_PRECISION, OCL_PREC, or HIP_PRECISION to one
+of -D_SINGLE_SINGLE, -D_DOUBLE_DOUBLE, or -D_SINGLE_DOUBLE.

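For example, one way to switch the Linux OpenCL build to full double
precision is to override the flag on the make command line (a sketch,
assuming GNU make command-line overrides; otherwise edit the Makefile):

  make -f Makefile.linux_opencl clean
  make -f Makefile.linux_opencl OCL_PREC=-D_DOUBLE_DOUBLE
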
-GENERAL NOTES
---------------------------------
+Some accelerators or OpenCL implementations only support single precision.
+This mode should be used with care and appropriate validation as the errors
+can scale with system size in this implementation. This can be useful for
+accelerating test runs when setting up a simulation for production runs on
+another machine. In the case where only single precision is supported, either
+LAMMPS must be compiled with -DFFT_SINGLE to use PPPM with GPU acceleration
+or GPU acceleration should be disabled for PPPM (e.g. suffix off or pair/only
+as described in the LAMMPS documentation).

-This library, libgpu.a, provides routines for GPU acceleration
-of certain LAMMPS styles and neighbor list builds. Compilation of this
-library requires installing the CUDA GPU driver and CUDA toolkit for
-your operating system. Installation of the CUDA SDK is not necessary.
-In addition to the LAMMPS library, the binary nvc_get_devices will also
-be built. This can be used to query the names and properties of GPU
-devices on your system. A Makefile for OpenCL and ROCm HIP compilation
-is provided, but support for it is not currently provided by the developers.
-Details of the implementation are provided in:

-----

+------------------------------------------------------------------------------
+CUDA BUILD NOTES
+------------------------------------------------------------------------------
+NOTE: when compiling with CMake, all of the considerations listed below
+are handled within the CMake configuration process, so no separate
+compilation of the gpu library is required. Also this will build in support
+for all compute architectures that are supported by the CUDA toolkit version
+used to build the gpu library.
+
+If you do not want to use a fat binary, that supports multiple CUDA
+architectures, the CUDA_ARCH must be set to match the GPU architecture. This
+is reported by the nvc_get_devices executable created by the build process and
+a detailed list of GPU architectures and CUDA compatible GPUs can be found
+e.g. here: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
+
+The CUDA_HOME variable should be set to the location of the CUDA toolkit.
+
+To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of
+the Makefiles. CUDA_ARCH should be set based on the compute capability of
+your GPU. This can be verified by running the nvc_get_devices executable after
+the build is complete. Additionally, the GPU package must be installed and
+compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the
+LAMMPS makefile.

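For example (an illustrative override for an sm_80 device; verify your
GPU's compute capability with nvc_get_devices first):

  make -f Makefile.linux CUDA_ARCH="-arch=sm_80" CUDA_HOME=/usr/local/cuda
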
+Please note that the GPU library accesses the CUDA driver library directly,
+so it needs to be linked with the CUDA driver library (libcuda.so) that ships
+with the Nvidia driver. If you are compiling LAMMPS on the head node of a GPU
+cluster, this library may not be installed, so you may need to copy it over
+from one of the compute nodes (best into this directory). Recent CUDA toolkits
+starting from CUDA 9 provide a dummy libcuda.so library (typically under
+$(CUDA_HOME)/lib64/stubs), that can be used for linking.
+
+Best performance with the GPU library is typically with multiple MPI processes
+sharing the same GPU cards. For NVIDIA, this is most efficient with CUDA
+MPS enabled. To prevent runtime errors for GPUs configured in exclusive process
+mode with MPS, the GPU library should be built with either of the equivalent
+-DCUDA_MPS_SUPPORT or -DCUDA_PROXY flags.

------------------------------------------------------------------------------
|
||||
HIP BUILD NOTES
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
1. GPU sorting requires installing hipcub
|
||||
(https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend
|
||||
additionally requires cub (https://nvlabs.github.io/cub). Download and
|
||||
extract the cub directory to lammps/lib/gpu/ or specify an appropriate
|
||||
path in lammps/lib/gpu/Makefile.hip.
|
||||
2. In Makefile.hip it is possible to specify the target platform via
|
||||
export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
|
||||
architecture (gfx803, gfx900, gfx906 etc.)
|
||||
3. If your MPI implementation does not support `mpicxx --showme` command,
|
||||
it is required to specify the corresponding MPI compiler and linker flags
|
||||
in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.

------------------------------------------------------------------------------
OPENCL BUILD NOTES
------------------------------------------------------------------------------
If GERYON_NUMA_FISSION is defined at build time, LAMMPS will treat separate
NUMA nodes on GPUs or accelerators as separate devices. For example, a
2-socket CPU would appear as two separate devices for OpenCL (and LAMMPS
would require two MPI processes to use both sockets with the GPU library -
each with its own device ID as output by ocl_get_devices).
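For that 2-socket example, a run might look like this (executable and input
file names are placeholders for your build):

./ocl_get_devices        # lists the two device IDs, e.g. 0 and 1
mpirun -np 2 ./lmp_linux -sf gpu -pk gpu 2 -in in.lj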

For a debug build, use "-DUCL_DEBUG -DGERYON_KERNEL_DUMP" and remove
"-DUCL_NO_EXIT" and "-DMPI_GERYON" from the build options.

------------------------------------------------------------------------------
ALL PREPROCESSOR OPTIONS (For Advanced Users)
------------------------------------------------------------------------------
_SINGLE_SINGLE        Build library for single precision mode
_SINGLE_DOUBLE        Build library for mixed precision mode
_DOUBLE_DOUBLE        Build library for double precision mode
CUDA_MPS_SUPPORT      Do not generate errors for exclusive mode for CUDA
CUDA_PROXY            Same as above
MPI_GERYON            Library should use MPI_Abort for unhandled errors
GERYON_NUMA_FISSION   Accelerators with main memory NUMA are split into
                      multiple virtual accelerators, one for each NUMA node
LAL_USE_OMP=0         Disable OpenMP in lib, regardless of compiler setting
LAL_USE_OMP_SIMD=0    Disable OpenMP SIMD in lib, regardless of compiler setting
GERYON_OCL_FLUSH      For OpenCL, flush queue after every enqueue
LAL_NO_OCL_EV_JIT     Turn off JIT specialization for kernels in OpenCL
LAL_USE_OLD_NEIGHBOR  Use old neighbor list algorithm
USE_CUDPP             Enable GPU binning in neighbor builds (not recommended)
USE_HIP_DEVICE_SORT   Enable GPU binning for HIP builds
                      (only w/ LAL_USE_OLD_NEIGHBOR)
LAL_NO_BLOCK_REDUCE   Use host for energy/virial accumulation
LAL_OCL_EXTRA_ARGS    Supply extra args for OpenCL compiler, delimited with :
UCL_NO_EXIT           LAMMPS should handle errors instead of the Geryon lib
UCL_DEBUG             Debug build for Geryon
GERYON_KERNEL_DUMP    Dump all compiled OpenCL programs with compiler
                      flags and build logs
GPU_CAST              Casting performed on GPU, untested recently
THREE_CONCURRENT      Concurrent 3-body calcs in separate queues, untested
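As an illustration, several of these can be appended to the build defines in
a Makefile (whether LMP_INC or another variable carries them depends on the
particular Makefile used):

LMP_INC = -DLAMMPS_SMALLBIG -DGERYON_OCL_FLUSH -DLAL_NO_BLOCK_REDUCE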


------------------------------------------------------------------------------
References for Details
------------------------------------------------------------------------------

Brown, W.M., Wang, P., Plimpton, S.J., Tharrington, A.N. Implementing
Molecular Dynamics on Hybrid High Performance Computers - Short Range

@@ -89,116 +279,3 @@ Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High
Performance Computers - Three-Body Potentials. Computer Physics
Communications. 2013. 184: p. 2785–2793.

----

NOTE: Installation of the CUDA SDK is not required, only the CUDA
toolkit itself or an OpenCL 1.2 compatible header and library.

Pair styles supporting GPU acceleration with this library
are marked in the list of Pair style potentials with a "g".
See the online version at: https://lammps.sandia.gov/doc/Commands_pair.html

In addition, the (plain) pppm kspace style is supported as well.


MULTIPLE LAMMPS PROCESSES
--------------------------------

Multiple LAMMPS MPI processes can share GPUs on the system, but multiple
GPUs cannot be utilized by a single MPI process. In many cases, the best
performance will be obtained by running as many MPI processes as there are
CPU cores available, with the condition that the number of MPI processes is
an integer multiple of the number of GPUs being used. See the LAMMPS user
manual for details on running with GPU acceleration.
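For instance, on a node with 16 cores and 2 GPUs (counts hypothetical), one
might launch:

mpirun -np 16 ./lmp_linux -sf gpu -pk gpu 2 -in in.script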


BUILDING AND PRECISION MODES
--------------------------------

To build, edit the CUDA_ARCH, CUDA_PRECISION, and CUDA_HOME variables in one
of the Makefiles. CUDA_ARCH should be set based on the compute capability of
your GPU, which can be verified by running the nvc_get_devices executable
after the build is complete. Additionally, the GPU package must be installed
and compiled for LAMMPS. This may require editing the gpu_SYSPATH variable
in the LAMMPS makefile.

Please note that the GPU library accesses the CUDA driver library directly,
so it needs to be linked not only to the CUDA runtime library (libcudart.so)
that ships with the CUDA toolkit, but also with the CUDA driver library
(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS
on the head node of a GPU cluster, this library may not be installed, so you
may need to copy it over from one of the compute nodes (best into this
directory). Recent CUDA toolkits, starting with CUDA 9, provide a dummy
libcuda.so library (typically under $(CUDA_HOME)/lib64/stubs) that can be
used for linking.

The gpu library supports 3 precision modes as determined by
the CUDA_PRECISION variable:

CUDA_PRECISION = -D_SINGLE_SINGLE  # Single precision for all calculations
CUDA_PRECISION = -D_DOUBLE_DOUBLE  # Double precision for all calculations
CUDA_PRECISION = -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double

As of CUDA 7.5, only GPUs with compute capability 2.0 (Fermi) or newer are
supported, and as of CUDA 9.0, only compute capability 3.0 (Kepler) or
newer. This library retains some support for GPUs older than that, which
requires an additional preprocessor flag and limits features, but it is kept
for historical reasons only. There is no value in trying to use those GPUs
for production calculations.

You have to make sure that you set a CUDA_ARCH line suitable for your
hardware and CUDA toolkit version: e.g. -arch=sm_35 for a Tesla K20 or K40,
or -arch=sm_52 for a GeForce GTX Titan X. A detailed list of GPU
architectures and CUDA compatible GPUs can be found e.g. here:
https://en.wikipedia.org/wiki/CUDA#GPUs_supported

NOTE: when compiling with CMake, all of the considerations listed below
are handled within the CMake configuration process, so no separate
compilation of the gpu library is required. This will also build in support
for all compute architectures that are supported by the CUDA toolkit version
used to build the gpu library.

Please note the CUDA_CODE settings in Makefile.linux_multi, which allow
this library to be compiled with support for multiple GPU architectures.
The list can be extended for newer GPUs with newer CUDA toolkits, and it
should allow building a single GPU library compatible with all GPUs that
are worth using for GPU acceleration and supported by the current CUDA
toolkits and drivers.

NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
or LAMMPS_SMALLSMALL, if specified when building LAMMPS (i.e. in
src/MAKE/Makefile.foo), should be consistent with that specified
when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).
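As a sketch (Makefile names as in the note above), the two settings simply
need to agree:

# in src/MAKE/Makefile.foo and in lib/gpu/Makefile.bar
LMP_INC = -DLAMMPS_BIGBIG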

BUILDING FOR HIP FRAMEWORK
--------------------------------
1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm).
2. GPU sorting requires installing hipcub
   (https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend
   additionally requires cub (https://nvlabs.github.io/cub). Download and
   extract the cub directory to lammps/lib/gpu/ or specify an appropriate
   path in lammps/lib/gpu/Makefile.hip.
3. In Makefile.hip it is possible to specify the target platform via
   export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc, as well as the target
   architecture (gfx803, gfx900, gfx906, etc.)
4. If your MPI implementation does not support the `mpicxx --showme`
   command, you must specify the corresponding MPI compiler and linker flags
   in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
5. Build the GPU library (libgpu.a):
   cd lammps/lib/gpu; make -f Makefile.hip -j
6. Build the LAMMPS executable (lmp_hip):
   cd ../../src; make hip -j

EXAMPLE CONVENTIONAL BUILD PROCESS
--------------------------------

cd ~/lammps/lib/gpu
emacs Makefile.linux
make -f Makefile.linux
./nvc_get_devices
cd ../../src
emacs ./MAKE/Makefile.linux
make yes-asphere
make yes-kspace
make yes-gpu
make linux

@@ -24,6 +24,8 @@ namespace ucl_hip {
// --------------------------------------------------------------------------
typedef hipStream_t command_queue;

inline void ucl_flush(command_queue &cq) {}

inline void ucl_sync(hipStream_t &stream) {
  CU_SAFE_CALL(hipStreamSynchronize(stream));
}
@@ -143,15 +145,26 @@ class UCL_Device {
  inline std::string device_type_name(const int i) { return "GPU"; }

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i) { return UCL_GPU; }
  inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; }

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }

  /// Returns preferred vector width
  inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp32_width(const int i)
    {return _properties[i].SIMDWidth;}
  /// Returns preferred vector width
  inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp64_width(const int i)
    {return _properties[i].SIMDWidth;}

  /// Returns true if double precision is support for the current device
  inline bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is support for the device
@@ -215,6 +228,18 @@ class UCL_Device {
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].maxThreadsPerBlock; }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int dim)
    { return group_size_dim(_device, dim); }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int i, const int dim)
    { return _properties[i].maxThreadsDim[dim];}

  /// Get the shared local memory size in bytes
  inline size_t slm_size() { return slm_size(_device); }
  /// Get the shared local memory size in bytes
  inline size_t slm_size(const int i)
    { return _properties[i].sharedMemPerBlock; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
@@ -255,11 +280,20 @@ class UCL_Device {
  inline int max_sub_devices(const int i)
    { return 0; }

  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support()
    { return has_shuffle_support(_device); }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support(const int i)
    { return arch(i)>=3.0; }

  /// List all devices along with all properties
  inline void print_all(std::ostream &out);

  /// Select the platform that has accelerators (for compatibility with OpenCL)
  inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
  /// For compatability with OCL API
  inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
                               const std::string vendor="")
    { return set_platform(0); }

  inline int load_module(const void* program, hipModule_t& module, std::string *log=nullptr){
    auto it = _loaded_modules.emplace(program, hipModule_t());

@@ -14,6 +14,7 @@
#include <fstream>
#include <string>
#include <iostream>
#include <cstdio>

namespace ucl_hip {

@@ -64,7 +65,7 @@ class UCL_Program {
  }

  /// Load a program from a string and compile with flags
  inline int load_string(const void *program, const char *flags="", std::string *log=nullptr) {
  inline int load_string(const void *program, const char *flags="", std::string *log=nullptr, FILE* foutput=nullptr) {
    return _device_ptr->load_module(program, _module, log);
  }

@@ -73,6 +74,7 @@ class UCL_Program {
  hipModule_t _module;
  hipStream_t _cq;
  friend class UCL_Texture;
  friend class UCL_Const;
};

/// Class for dealing with CUDA Driver kernels

@@ -107,6 +107,37 @@ class UCL_Texture {
  }
};

/// Class storing a const global memory reference
class UCL_Const {
 public:
  UCL_Const() {}
  ~UCL_Const() {}
  /// Construct with a specified global reference
  inline UCL_Const(UCL_Program &prog, const char *global_name)
    { get_global(prog,global_name); }
  /// Set the global reference for this object
  inline void get_global(UCL_Program &prog, const char *global_name) {
    _cq=prog.cq();
    CU_SAFE_CALL(hipModuleGetGlobal(&_global, &_global_bytes, prog._module,
                                    global_name));
  }
  /// Copy from array on host to const memory
  template <class numtyp>
  inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) {
    CU_SAFE_CALL(hipMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp),
                                    _cq));
  }
  /// Get device ptr associated with object
  inline const void* begin() const { return &_global; }
  inline void clear() {}

 private:
  hipStream_t _cq;
  void* _global;
  size_t _global_bytes;
  friend class UCL_Kernel;
};

} // namespace

#endif

@@ -37,6 +37,8 @@ namespace ucl_cudadr {
// --------------------------------------------------------------------------
typedef CUstream command_queue;

inline void ucl_flush(command_queue &cq) {}

inline void ucl_sync(CUstream &stream) {
  CU_SAFE_CALL(cuStreamSynchronize(stream));
}
@@ -156,15 +158,26 @@ class UCL_Device {
  inline std::string device_type_name(const int i) { return "GPU"; }

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i) { return UCL_GPU; }
  inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; }

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }

  /// Returns preferred vector width
  inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp32_width(const int i)
    {return _properties[i].SIMDWidth;}
  /// Returns preferred vector width
  inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp64_width(const int i)
    {return _properties[i].SIMDWidth;}

  /// Returns true if double precision is support for the current device
  inline bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is support for the device
@@ -228,6 +241,18 @@ class UCL_Device {
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].maxThreadsPerBlock; }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int dim)
    { return group_size_dim(_device, dim); }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int i, const int dim)
    { return _properties[i].maxThreadsDim[dim]; }

  /// Get the shared local memory size in bytes
  inline size_t slm_size() { return slm_size(_device); }
  /// Get the shared local memory size in bytes
  inline size_t slm_size(const int i)
    { return _properties[i].sharedMemPerBlock; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
@@ -268,11 +293,22 @@ class UCL_Device {
  inline int max_sub_devices(const int i)
    { return 0; }

  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support()
    { return has_shuffle_support(_device); }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support(const int i)
    { return arch(i)>=3.0; }

  /// List all devices along with all properties
  inline void print_all(std::ostream &out);

  /// Select the platform that has accelerators (for compatibility with OpenCL)
  inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
  /// For compatability with OCL API
  inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
                               const std::string vendor="",
                               const int ndevices=-1,
                               const int first_device=-1)
    { return set_platform(0); }

 private:
  int _device, _num_devices;

@@ -26,6 +26,7 @@

#include "nvd_device.h"
#include <fstream>
#include <cstdio>

namespace ucl_cudadr {

@@ -77,7 +78,7 @@ class UCL_Program {

  /// Load a program from a string and compile with flags
  inline int load_string(const void *program, const char *flags="",
                         std::string *log=nullptr) {
                         std::string *log=nullptr, FILE* foutput=nullptr) {
    if (std::string(flags)=="BINARY")
      return load_binary((const char *)program);
    const unsigned int num_opts=2;
@@ -100,12 +101,25 @@ class UCL_Program {

    if (err != CUDA_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << std::endl
      std::cerr << std::endl << std::endl
                << "----------------------------------------------------------\n"
                << " UCL Error: Error compiling PTX Program...\n"
                << "----------------------------------------------------------\n";
      std::cerr << log << std::endl;
      std::cerr << log << std::endl
                << "----------------------------------------------------------\n\n";
      #endif
      if (foutput != NULL) {
        fprintf(foutput,"\n\n");
        fprintf(foutput,
                "----------------------------------------------------------\n");
        fprintf(foutput," UCL Error: Error compiling PTX Program...\n");
        fprintf(foutput,
                "----------------------------------------------------------\n");
        fprintf(foutput,"%s\n",log);
        fprintf(foutput,
                "----------------------------------------------------------\n");
        fprintf(foutput,"\n\n");
      }
      return UCL_COMPILE_ERROR;
    }

@@ -139,11 +153,15 @@ class UCL_Program {
    return UCL_SUCCESS;
  }

  /// Return the default command queue/stream associated with this data
  inline command_queue & cq() { return _cq; }

  friend class UCL_Kernel;
 private:
  CUmodule _module;
  CUstream _cq;
  friend class UCL_Texture;
  friend class UCL_Const;
};

/// Class for dealing with CUDA Driver kernels

@@ -38,8 +38,11 @@ class UCL_Texture {
  inline UCL_Texture(UCL_Program &prog, const char *texture_name)
    { get_texture(prog,texture_name); }
  /// Set the texture reference for this object
  inline void get_texture(UCL_Program &prog, const char *texture_name)
    { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
  inline void get_texture(UCL_Program &prog, const char *texture_name) {
    #if (CUDA_VERSION < 11000)
    CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name));
    #endif
  }

  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp>
@@ -72,11 +75,14 @@ class UCL_Texture {
  }

 private:
  #if (CUDA_VERSION < 11000)
  CUtexref _tex;
  #endif
  friend class UCL_Kernel;

  template<class mat_typ>
  inline void _bind_float(mat_typ &vec, const unsigned numel) {
    #if (CUDA_VERSION < 11000)
    #ifdef UCL_DEBUG
    assert(numel!=0 && numel<5);
    #endif
@@ -90,10 +96,42 @@ class UCL_Texture {
    else
      CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2));
    }
    #endif
  }

};

/// Class storing a const global memory reference
class UCL_Const {
 public:
  UCL_Const() {}
  ~UCL_Const() {}
  /// Construct with a specified global reference
  inline UCL_Const(UCL_Program &prog, const char *global_name)
    { get_global(prog,global_name); }
  /// Set the global reference for this object
  inline void get_global(UCL_Program &prog, const char *global_name) {
    _cq=prog.cq();
    CU_SAFE_CALL(cuModuleGetGlobal(&_global, &_global_bytes, prog._module,
                                   global_name));
  }
  /// Copy from array on host to const memory
  template <class numtyp>
  inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) {
    CU_SAFE_CALL(cuMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp),
                                   _cq));
  }
  /// Get device ptr associated with object
  inline const CUdeviceptr * begin() const { return &_global; }
  inline void clear() {}

 private:
  CUstream _cq;
  CUdeviceptr _global;
  size_t _global_bytes;
  friend class UCL_Kernel;
};

} // namespace

#endif

@@ -28,12 +28,8 @@
#include <vector>
#include <iostream>

/* We default to OpenCL 1.2 as target version for now as
 * there are known issues with OpenCL 2.0 and later.
 * This is also to silence warnings from generic OpenCL headers */

#if !defined(CL_TARGET_OPENCL_VERSION)
#define CL_TARGET_OPENCL_VERSION 120
#ifndef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 210
#endif

#ifdef __APPLE__

@@ -55,17 +51,36 @@ namespace ucl_opencl {
typedef cl_command_queue command_queue;
typedef cl_context context_type;

inline void ucl_flush(command_queue &cq) { CL_SAFE_CALL(clFlush(cq)); }

inline void ucl_sync(cl_command_queue &cq) {
  CL_SAFE_CALL(clFinish(cq));
}

inline bool _shared_mem_device(cl_device_type &device_type) {
#if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON)
inline bool _shared_mem_device(cl_device_id &device) { return true; }
#elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF)
inline bool _shared_mem_device(cl_device_id &device) { return false; }
#else
inline bool _shared_mem_device(cl_device_id &device) {
  #ifdef CL_VERSION_1_2
  cl_bool br;
  CL_SAFE_CALL(clGetDeviceInfo(device, CL_DEVICE_HOST_UNIFIED_MEMORY,
                               sizeof(cl_bool), &br,NULL));
  return (br == CL_TRUE);
  #else
  cl_device_type device_type;
  CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
                               sizeof(device_type),&device_type,NULL));
  return (device_type==CL_DEVICE_TYPE_CPU);
  #endif
}
#endif

struct OCLProperties {
  std::string name;
  cl_device_type device_type;
  bool is_subdevice;
  cl_ulong global_mem;
  cl_ulong shared_mem;
  cl_ulong const_mem;
@@ -74,12 +89,16 @@ struct OCLProperties {
  size_t work_group_size;
  size_t work_item_size[3];
  bool double_precision;
  int preferred_vector_width32, preferred_vector_width64;
  int alignment;
  size_t timer_resolution;
  bool ecc_support;
  std::string c_version;
  bool partition_equal, partition_counts, partition_affinity;
  cl_uint max_sub_devices;
  int cl_device_version;
  bool has_subgroup_support;
  bool has_shuffle_support;
};

/// Class for looking at data parallel device properties
@@ -182,15 +201,26 @@ class UCL_Device {
  inline std::string device_type_name(const int i);

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i);
  inline enum UCL_DEVICE_TYPE device_type(const int i);

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i)
    { return _shared_mem_device(_properties[i].device_type); }
    { return _shared_mem_device(_cl_devices[i]); }

  /// Returns preferred vector width
  inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp32_width(const int i)
    {return _properties[i].preferred_vector_width32;}
  /// Returns preferred vector width
  inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp64_width(const int i)
    {return _properties[i].preferred_vector_width64;}

  /// Returns true if double precision is support for the current device
  inline bool double_precision() { return double_precision(_device); }
@@ -242,6 +272,18 @@ class UCL_Device {
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].work_group_size; }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int dim)
    { return group_size_dim(_device, dim); }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int i, const int dim)
    { return _properties[i].work_item_size[dim]; }

  /// Get the shared local memory size in bytes
  inline size_t slm_size() { return slm_size(_device); }
  /// Get the shared local memory size in bytes
  inline size_t slm_size(const int i)
    { return _properties[i].shared_mem; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
@@ -256,6 +298,12 @@ class UCL_Device {
  inline bool sharing_supported(const int i)
    { return true; }

  /// True if the device is a sub-device
  inline bool is_subdevice()
    { return is_subdevice(_device); }
  /// True if the device is a sub-device
  inline bool is_subdevice(const int i)
    { return _properties[i].is_subdevice; }
  /// True if splitting device into equal subdevices supported
  inline bool fission_equal()
    { return fission_equal(_device); }
@@ -274,6 +322,18 @@ class UCL_Device {
  /// True if splitting device into subdevices by affinity domains supported
  inline bool fission_by_affinity(const int i)
    { return _properties[i].partition_affinity; }
  /// True if the device has subgroup support
  inline bool has_subgroup_support()
    { return has_subgroup_support(_device); }
  /// True if the device has subgroup support
  inline bool has_subgroup_support(const int i)
    { return _properties[i].has_subgroup_support; }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support()
    { return has_shuffle_support(_device); }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support(const int i)
    { return _properties[i].has_shuffle_support; }

  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices()
@@ -281,6 +341,12 @@ class UCL_Device {
  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices(const int i)
    { return _properties[i].max_sub_devices; }
  /// OpenCL version supported by the device
  inline int cl_device_version()
    { return cl_device_version(_device); }
  /// OpenCL version supported by the device
  inline int cl_device_version(const int i)
    { return _properties[i].cl_device_version; }

  /// List all devices along with all properties
  inline void print_all(std::ostream &out);
@@ -288,8 +354,14 @@ class UCL_Device {
  /// Return the OpenCL type for the device
  inline cl_device_id & cl_device() { return _cl_device; }

  /// Select the platform that has accelerators
  inline int set_platform_accelerator(int pid=-1);
  /// Automatically set the platform by type, vendor, and/or CU count
  /** If first_device is positive, search restricted to platforms containing
    * this device IDs. If ndevices is positive, search is restricted
    * to platforms with at least that many devices **/
  inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
                               const std::string vendor="",
                               const int ndevices=-1,
                               const int first_device=-1);

 private:
  int _num_platforms;          // Number of platforms

@@ -322,8 +394,7 @@ UCL_Device::UCL_Device() {
    return;
  } else
    _num_platforms=static_cast<int>(nplatforms);
  // note that platform 0 may not necessarily be associated with accelerators
  set_platform_accelerator();
  set_platform(0);
}

UCL_Device::~UCL_Device() {
@@ -332,6 +403,14 @@ UCL_Device::~UCL_Device() {

void UCL_Device::clear() {
  _properties.clear();

  #ifdef GERYON_NUMA_FISSION
  #ifdef CL_VERSION_1_2
  for (int i=0; i<_cl_devices.size(); i++)
    CL_DESTRUCT_CALL(clReleaseDevice(_cl_devices[i]));
  #endif
  #endif

  _cl_devices.clear();
  if (_device>-1) {
    for (size_t i=0; i<_cq.size(); i++) {
@@ -341,6 +420,7 @@ void UCL_Device::clear() {
    CL_DESTRUCT_CALL(clReleaseContext(_context));
  }
  _device=-1;
  _num_devices=0;
}

int UCL_Device::set_platform(int pid) {
@@ -370,11 +450,51 @@ int UCL_Device::set_platform(int pid) {
  CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
                              &n));

  #ifndef GERYON_NUMA_FISSION
  // --- Store properties for each device
  for (int i=0; i<_num_devices; i++) {
    _cl_devices.push_back(device_list[i]);
    add_properties(device_list[i]);
  }
  #else
  // --- Create sub-devices for anything partitionable by NUMA and store props
  int num_unpart = _num_devices;
  _num_devices = 0;
  for (int i=0; i<num_unpart; i++) {
    cl_uint num_subdevices = 1;
    cl_device_id *subdevice_list = device_list + i;

    #ifdef CL_VERSION_1_2
    cl_device_affinity_domain adomain;
    CL_SAFE_CALL(clGetDeviceInfo(device_list[i],
                                 CL_DEVICE_PARTITION_AFFINITY_DOMAIN,
                                 sizeof(cl_device_affinity_domain),
                                 &adomain,NULL));

    cl_device_partition_property props[3];
    props[0]=CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
    props[1]=CL_DEVICE_AFFINITY_DOMAIN_NUMA;
    props[2]=0;
    if (adomain & CL_DEVICE_AFFINITY_DOMAIN_NUMA)
      CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, 0, NULL,
                                      &num_subdevices));
    if (num_subdevices > 1) {
      subdevice_list = new cl_device_id[num_subdevices];
      CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices,
                                      subdevice_list, &num_subdevices));
    }
    #endif

    for (int j=0; j<num_subdevices; j++) {
      _num_devices++;
      _cl_devices.push_back(subdevice_list[j]);
      add_properties(subdevice_list[j]);
    }

    if (num_subdevices > 1) delete[] subdevice_list;
  } // for i
  #endif

  delete[] device_list;
  return UCL_SUCCESS;
}
@@ -429,11 +549,18 @@ void UCL_Device::add_properties(cl_device_id device_list) {
                               sizeof(cl_uint),&op.alignment,nullptr));
  op.alignment/=8;

  cl_uint float_width;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
                               sizeof(float_width),&float_width,nullptr));
  op.preferred_vector_width32=float_width;

  // Determine if double precision is supported
  cl_uint double_width;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                               sizeof(double_width),&double_width,nullptr));
  op.preferred_vector_width64=double_width;
  if (double_width==0)
    op.double_precision=false;
  else
@@ -452,9 +579,14 @@ void UCL_Device::add_properties(cl_device_id device_list) {
    op.ecc_support=true;

  op.c_version="";
  op.is_subdevice=false;
  op.partition_equal=false;
  op.partition_counts=false;
  op.partition_affinity=false;
  op.max_sub_devices=1;
  op.cl_device_version=0;
  op.has_subgroup_support=false;
  op.has_shuffle_support=false;

  #ifdef CL_VERSION_1_2
  size_t return_bytes;
@@ -463,6 +595,13 @@ void UCL_Device::add_properties(cl_device_id device_list) {
  op.c_version=buffer;

  cl_device_partition_property pinfo[4];
  CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_TYPE,
                               4*sizeof(cl_device_partition_property),
                               &pinfo, &return_bytes));
  if (return_bytes == 0) op.is_subdevice=false;
  else if (pinfo[0]) op.is_subdevice=true;
  else op.is_subdevice=false;

  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PARTITION_PROPERTIES,
                               4*sizeof(cl_device_partition_property),
@@ -480,6 +619,46 @@ void UCL_Device::add_properties(cl_device_id device_list) {
  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
                               sizeof(cl_uint),&op.max_sub_devices,nullptr));

  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_VERSION,1024,buffer,nullptr));
  int cl_version_maj = buffer[7] - '0';
  int cl_version_min = buffer[9] - '0';
  op.cl_device_version = cl_version_maj * 100 + cl_version_min * 10;

  size_t ext_str_size_ret;
  CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, 0, nullptr,
                               &ext_str_size_ret));
  char buffer2[ext_str_size_ret];
  CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS,
                               ext_str_size_ret, buffer2, nullptr));
  #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)
  if (op.cl_device_version >= 210) {
    if ((std::string(buffer2).find("cl_khr_subgroups") != std::string::npos) ||
        (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos))
      op.has_subgroup_support=true;
    if (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos)
      op.has_shuffle_support=true;
  }
  #endif
  if (std::string(buffer2).find("cl_nv_device_attribute_query") !=
      std::string::npos) {
    #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
    #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
    #endif
    #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
    #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
    #endif
    cl_uint major, minor;
    CL_SAFE_CALL(clGetDeviceInfo(device_list,
                                 CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
                                 sizeof(cl_uint), &major, nullptr));
    CL_SAFE_CALL(clGetDeviceInfo(device_list,
                                 CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
                                 sizeof(cl_uint), &minor, nullptr));
    double arch = static_cast<double>(minor)/10+major;
    if (arch >= 3.0)
      op.has_shuffle_support=true;
  }
  #endif

  _properties.push_back(op);
@@ -516,7 +695,7 @@ std::string UCL_Device::device_type_name(const int i) {
}

// Get a string telling the type of the device
int UCL_Device::device_type(const int i) {
enum UCL_DEVICE_TYPE UCL_Device::device_type(const int i) {
  if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
    return UCL_CPU;
  else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@@ -529,14 +708,8 @@ int UCL_Device::device_type(const int i) {

// Set the CUDA device to the specified device number
int UCL_Device::set(int num) {
  cl_device_id *device_list = new cl_device_id[_num_devices];
  cl_uint n;
  CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
                              device_list,&n));

  _device=num;
  _cl_device=device_list[_device];
  delete[] device_list;
  _cl_device=_cl_devices[_device];
  return create_context();
}

@@ -555,6 +728,11 @@ void UCL_Device::print_all(std::ostream &out) {
    out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
    out << "  Type of device: "
        << device_type_name(i).c_str() << std::endl;
    out << "  Is a subdevice: ";
    if (is_subdevice(i))
      out << "Yes\n";
    else
      out << "No\n";
    out << "  Double precision support: ";
    if (double_precision(i))
      out << "Yes\n";
@@ -613,31 +791,91 @@ void UCL_Device::print_all(std::ostream &out) {
      out << "No\n";
    out << "  Maximum subdevices from fission: "
        << max_sub_devices(i) << std::endl;
    out << "  Shared memory system: ";
    if (shared_memory(i))
      out << "Yes\n";
    else
      out << "No\n";
  }
}
}

// Select the platform that is associated with accelerators
// if pid < 0, select the first platform
int UCL_Device::set_platform_accelerator(int pid) {
  if (pid < 0) {
    int found = 0;
int UCL_Device::auto_set_platform(const enum UCL_DEVICE_TYPE type,
                                  const std::string vendor,
                                  const int ndevices,
                                  const int first_device) {
  if (_num_platforms < 2) return set_platform(0);

  int last_device = -1;
  if (first_device > -1) {
    if (ndevices)
      last_device = first_device + ndevices - 1;
    else
      last_device = first_device;
  }

  bool vendor_match=false;
  bool type_match=false;
  int max_cus=0;
  int best_platform=0;

  std::string vendor_upper=vendor;
  for (int i=0; i<vendor.length(); i++)
    if (vendor_upper[i]<='z' && vendor_upper[i]>='a')
      vendor_upper[i]=toupper(vendor_upper[i]);

  for (int n=0; n<_num_platforms; n++) {
    set_platform(n);
    for (int i=0; i<num_devices(); i++) {
      if ((_properties[i].device_type & CL_DEVICE_TYPE_CPU) ||
          (_properties[i].device_type & CL_DEVICE_TYPE_GPU) ||
          (_properties[i].device_type & CL_DEVICE_TYPE_ACCELERATOR)) {
        found = 1;
        break;
    if (last_device > -1 && last_device >= num_devices()) continue;
    if (ndevices > num_devices()) continue;

    int first_id=0;
    int last_id=num_devices()-1;
    if (last_device > -1) {
      first_id=first_device;
      last_id=last_device;
    }

    if (vendor_upper!="") {
      std::string pname = platform_name();
      for (int i=0; i<pname.length(); i++)
        if (pname[i]<='z' && pname[i]>='a')
          pname[i]=toupper(pname[i]);

      if (pname.find(vendor_upper)!=std::string::npos) {
        if (vendor_match == false) {
          best_platform=n;
          max_cus=0;
          vendor_match=true;
        }
      } else if (vendor_match)
        continue;
    }

    if (type != UCL_DEFAULT) {
      bool ptype_matched=false;
      for (int d=first_id; d<=last_id; d++) {
        if (type==device_type(d)) {
          if (type_match == false) {
            best_platform=n;
            max_cus=0;
            type_match=true;
            ptype_matched=true;
          }
        }
      if (found) return UCL_SUCCESS;
      }
      return UCL_ERROR;
  } else {
    return set_platform(pid);
      if (type_match==true && ptype_matched==false)
        continue;
    }

    for (int d=first_id; d<=last_id; d++) {
      if (cus(d) > max_cus) {
        best_platform=n;
        max_cus=cus(d);
      }
    }
  }
  return set_platform(best_platform);
}

} // namespace ucl_opencl

@@ -2,6 +2,7 @@
ocl_kernel.h
-------------------
W. Michael Brown
Nitin Dhamankar (Intel)

Utilities for dealing with OpenCL kernels

@@ -26,6 +27,7 @@

#include "ocl_device.h"
#include <fstream>
#include <cstdio>

namespace ucl_opencl {

@@ -93,7 +95,7 @@ class UCL_Program {

  /// Load a program from a string and compile with flags
  inline int load_string(const void *program, const char *flags="",
                         std::string *log=nullptr) {
                         std::string *log=nullptr, FILE* foutput=nullptr) {
    cl_int error_flag;
    const char *prog=(const char *)program;
    _program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag);
@@ -107,26 +109,65 @@ class UCL_Program {
                                       sizeof(cl_build_status),&build_status,
                                       nullptr));

    if (build_status != CL_SUCCESS || log!=nullptr) {
    #ifdef GERYON_KERNEL_DUMP
    {
      size_t ms;
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
                                         nullptr, &ms));
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         0,NULL,&ms));
      char *build_log = new char[ms];
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms,
                                         build_log, nullptr));
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         ms,build_log, NULL));
      std::cout << std::endl << std::endl
                << "--------------------------------------------------------\n"
                << "  UCL PROGRAM DUMP\n"
                << "--------------------------------------------------------\n"
                << flags << std::endl
                << "--------------------------------------------------------\n"
                << prog << std::endl
                << "--------------------------------------------------------\n"
                << build_log
                << "--------------------------------------------------------\n"
                << std::endl << std::endl;
    }
    #endif

    if (build_status != CL_SUCCESS || log!=NULL) {
      size_t ms;
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         0,NULL,&ms));
      char *build_log = new char[ms];
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         ms,build_log, NULL));

      if (log!=nullptr)
        *log=std::string(build_log);

      if (build_status != CL_SUCCESS) {
        #ifndef UCL_NO_EXIT
        std::cerr << std::endl
        std::cerr << std::endl << std::endl
                  << "----------------------------------------------------------\n"
                  << " UCL Error: Error compiling OpenCL Program ("
                  << build_status << ") ...\n"
                  << "----------------------------------------------------------\n";
        std::cerr << build_log << std::endl;
        std::cerr <<
          "----------------------------------------------------------\n"
                  << std::endl << std::endl;
        #endif
        if (foutput != NULL) {
          fprintf(foutput,"\n\n");
          fprintf(foutput,
                  "----------------------------------------------------------\n");
          fprintf(foutput,
                  " UCL Error: Error compiling OpenCL Program (%d) ...\n",
                  build_status);
          fprintf(foutput,
                  "----------------------------------------------------------\n");
          fprintf(foutput,"%s\n",build_log);
          fprintf(foutput,
                  "----------------------------------------------------------\n");
          fprintf(foutput,"\n\n");
        }
        delete[] build_log;
        return UCL_COMPILE_ERROR;
      } else delete[] build_log;
@@ -141,6 +182,7 @@ class UCL_Program {
  inline void cq(command_queue &cq_in) { _cq=cq_in; }

  friend class UCL_Kernel;
  friend class UCL_Const;
 private:
  bool _init_done;
  cl_program _program;
@@ -322,9 +364,45 @@ class UCL_Kernel {
  inline void cq(command_queue &cq_in) { _cq=cq_in; }
  #include "ucl_arg_kludge.h"

  #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)
  inline size_t max_subgroup_size(const size_t block_size_x) {
    size_t block_size = block_size_x;
    CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device,
                   CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                   sizeof(block_size), (void *) &block_size,
                   sizeof(size_t), (void *) &_mx_subgroup_sz,
                   NULL));
    return _mx_subgroup_sz;
  }

  inline size_t max_subgroup_size(const size_t block_size_x,
                                  const size_t block_size_y) {
    size_t block_size[2] { block_size_x, block_size_y };
    CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device,
                   CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                   sizeof(block_size), (void *) &block_size,
                   sizeof(size_t), (void *) &_mx_subgroup_sz,
                   NULL));
    return _mx_subgroup_sz;
  }

  inline size_t max_subgroup_size(const size_t block_size_x,
                                  const size_t block_size_y,
                                  const size_t block_size_z) {
    size_t block_size[3] { block_size_x, block_size_y, block_size_z };
    CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device,
                   CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                   sizeof(block_size), (void *) &block_size,
                   sizeof(size_t), (void *) &_mx_subgroup_sz,
                   NULL));
    return _mx_subgroup_sz;
  }
  #endif

 private:
  cl_kernel _kernel;
  cl_program _program;
  cl_device_id _device;
  cl_uint _dimensions;
  size_t _block_size[3];
  size_t _num_blocks[3];
@@ -338,6 +416,11 @@ class UCL_Kernel {
  unsigned _kernel_info_nargs;
  //std::string _kernel_info_args[256];
  #endif

  #ifdef CL_VERSION_2_1
  size_t _mx_subgroup_sz;  // Maximum sub-group size for this kernel
  #endif

};

inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
@@ -347,6 +430,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
  CL_SAFE_CALL(clRetainCommandQueue(_cq));
  _program=program._program;
  CL_SAFE_CALL(clRetainProgram(_program));
  _device=program._device;
  cl_int error_flag;
  _kernel=clCreateKernel(program._program,function,&error_flag);

@@ -380,8 +464,11 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
}

void UCL_Kernel::run() {
  CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,nullptr,
                                      _num_blocks,_block_size,0,nullptr,nullptr));
  CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
                                      _num_blocks,_block_size,0,NULL,NULL));
  #ifdef GERYON_OCL_FLUSH
  ucl_flush(_cq);
  #endif
}

} // namespace

@@ -4,12 +4,8 @@
#include <cstdio>
#include <cassert>

/* We default to OpenCL 1.2 as target version for now as
 * there are known issues with OpenCL 2.0 and later.
 * This is also to silence warnings from generic OpenCL headers */

#if !defined(CL_TARGET_OPENCL_VERSION)
#define CL_TARGET_OPENCL_VERSION 120
#ifndef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 210
#endif

#ifdef __APPLE__

@@ -108,7 +108,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
    return UCL_MEMORY_ERROR;
  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,nullptr,nullptr,nullptr);
                                       map_perm,0,n,0,NULL,NULL,NULL);

  mat.cq()=cm.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@@ -116,18 +116,15 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
}

template <class mat_type, class copy_type>
inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
                      const size_t n) {
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));
  cl_mem_flags orig_flags;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
                                  &orig_flags,nullptr));
  orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;

  mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
                              *mat.host_ptr(), &error_flag);
  cl_buffer_region subbuffer;
  subbuffer.origin = o;
  subbuffer.size = n;
  mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0,
                                 CL_BUFFER_CREATE_TYPE_REGION, &subbuffer,
                                 &error_flag);

  CL_CHECK_ERR(error_flag);
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@@ -470,6 +467,9 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
  size_t kn=n/sizeof(typename mat_type::data_type);
  CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0));
  #endif
  #ifdef GERYON_OCL_FLUSH
  ucl_flush(cq);
  #endif
}

// --------------------------------------------------------------------------
@@ -585,7 +585,10 @@ template <> struct _ucl_memcpy<1,0> {
    std::cerr << "UCL_COPY 1NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
                                     dst.begin(),0,nullptr,nullptr));
                                     dst.begin(),0,NULL,NULL));
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@@ -617,6 +620,9 @@ template <> struct _ucl_memcpy<1,0> {
      src_offset+=spitch;
      dst_offset+=dpitch;
    }
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
};

@@ -637,7 +643,10 @@ template <> struct _ucl_memcpy<0,1> {
    std::cerr << "UCL_COPY 3NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
                                      src.begin(),0,nullptr,nullptr));
                                      src.begin(),0,NULL,NULL));
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@@ -669,6 +678,9 @@ template <> struct _ucl_memcpy<0,1> {
      src_offset+=spitch;
      dst_offset+=dpitch;
    }
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
};

@@ -690,6 +702,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {
    #endif

    if (block==CL_TRUE) ucl_sync(cq);
    #ifdef GERYON_OCL_FLUSH
    else ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@@ -720,6 +735,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {
    #endif

    if (block==CL_TRUE) ucl_sync(cq);
    #ifdef GERYON_OCL_FLUSH
    else ucl_flush(cq);
    #endif
  }
};


@@ -53,6 +53,59 @@ class UCL_Texture {
  friend class UCL_Kernel;
};

/// Class storing a const global memory reference
class UCL_Const {
 public:
  UCL_Const() : _global_bytes(0), _active(false) {}
  ~UCL_Const() { clear(); }
  /// Construct with a specified global reference
  inline UCL_Const(UCL_Program &prog, const char *global_name)
    { get_global(prog,global_name); }
  /// Set the global reference for this object
  inline void get_global(UCL_Program &prog, const char *global_name) {
    if (_active) {
      CL_DESTRUCT_CALL(clReleaseContext(_context));
      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
    }
    _active = true;
    _context = prog._context;
    _cq = prog._cq;
    CL_SAFE_CALL(clRetainContext(_context));
    CL_SAFE_CALL(clRetainCommandQueue(_cq));
  }
  /// Copy from array on host to const memory
  template <class numtyp>
  inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) {
    const int bytes=numel*sizeof(numtyp);
    if (_global_bytes < bytes) {
      if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global));
      cl_int e;
      _global = clCreateBuffer(_context, CL_MEM_READ_ONLY, bytes, NULL, &e);
      CL_SAFE_CALL(e);
    }
    CL_SAFE_CALL(clEnqueueWriteBuffer(_cq, _global, CL_FALSE, 0, bytes,
                                      (void *)src.begin(), 0, NULL, NULL));
  }
  /// Get device ptr associated with object
  inline const cl_mem * begin() const { return &_global; }
  inline void clear() {
    if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global));
    if (_active) {
      CL_DESTRUCT_CALL(clReleaseContext(_context));
      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
    }
    _global_bytes=0;
    _active=false;
  }

 private:
  cl_mem _global;
  size_t _global_bytes;
  cl_context _context;
  cl_command_queue _cq;
  bool _active;
};

} // namespace

#endif
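
For orientation, a minimal usage sketch of the new UCL_Const class follows;
the program handle, global name, and host-vector setup are hypothetical
placeholders, not part of this commit:

// Sketch only: assumes 'prog' is a compiled UCL_Program whose source
// declares a constant/global buffer named "coeff" (hypothetical), and
// 'host_coeff' is an allocated UCL_H_Vec<float> with 8 elements.
for (int i=0; i<8; i++) host_coeff[i]=0.5f*i;  // fill host staging data

UCL_Const coeff(prog, "coeff");      // bind/create the device-side storage
coeff.update_device(host_coeff, 8);  // asynchronous copy of 8 elements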

@@ -61,7 +61,6 @@ class UCL_Timer {
  /// Initialize command queue for timing
  inline void init(UCL_Device &dev, command_queue &cq) {
    clear();
    t_factor=dev.timer_resolution()/1000000000.0;
    _cq=cq;
    clRetainCommandQueue(_cq);
    _initialized=true;
@@ -124,17 +123,17 @@ class UCL_Timer {
    clReleaseEvent(start_event);
    clReleaseEvent(stop_event);
    has_measured_time = false;
    return (tend-tstart)*t_factor;
    return (tend-tstart)*1e-6;
  }

  /// Return the time (s) of last start to stop - Forces synchronization
  inline double seconds() { return time()/1000.0; }
  inline double seconds() { return time()*1e-3; }

  /// Return the total time in ms
  inline double total_time() { return _total_time; }

  /// Return the total time in seconds
  inline double total_seconds() { return _total_time/1000.0; }
  inline double total_seconds() { return _total_time*1e-3; }

 private:
  cl_event start_event, stop_event;

@@ -69,17 +69,17 @@ class UCL_BaseMat {
  /// Return the type/permissions of memory allocation
  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
    * or UCL_VIEW **/
  /// Assert that any ops in associate command queue have been issued to device
  inline void flush() { ucl_flush(_cq); }

  inline enum UCL_MEMOPT kind() const { return _kind; }

  inline bool shared_mem_device() {
    #ifdef _OCL_MAT
    cl_device_id device;
    CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE,
                                       sizeof(cl_device_id),&device,nullptr));
    cl_device_type device_type;
    CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
                                 sizeof(device_type),&device_type,nullptr));
    return _shared_mem_device(device_type);
                                       sizeof(cl_device_id),&device,NULL));
    return _shared_mem_device(device);
    #else
    return false;
    #endif

@@ -39,7 +39,7 @@ class UCL_D_Vec : public UCL_BaseMat {
  };
  typedef numtyp data_type;

  UCL_D_Vec() : _cols(0) {}
  UCL_D_Vec() : _cols(0), _row_bytes(0) {}
  ~UCL_D_Vec() { _device_free(*this); }

  /// Construct with n columns

@@ -44,10 +44,8 @@ using namespace ucl_hip;
int main(int argc, char** argv) {
  UCL_Device cop;
  std::cout << "Found " << cop.num_platforms() << " platform(s).\n";
  if (cop.num_platforms()>0) {
    std::cout << "Using platform: " << cop.platform_name() << std::endl;
  if (cop.num_platforms()>0)
    cop.print_all(std::cout);
  }
  return 0;
}


@@ -241,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat {
    _array=input.begin()+offset;
    _end=_array+_cols;
    #ifdef _OCL_MAT
    _host_view(*this,input,_row_bytes*_rows);
    _host_view(*this,input,offset*sizeof(numtyp),_row_bytes*_rows);
    #endif
  }


@@ -39,7 +39,7 @@ class UCL_H_Vec : public UCL_BaseMat {
  };
  typedef numtyp data_type;

  UCL_H_Vec() : _cols(0) {
  UCL_H_Vec() : _cols(0), _row_bytes(0) {
    #ifdef _OCL_MAT
    _carray=(cl_mem)(0);
    #endif
@@ -135,7 +135,7 @@ class UCL_H_Vec : public UCL_BaseMat {
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    _array=input.begin();
    _array=(numtyp *)input.begin();
    _end=_array+_cols;
    #ifdef _OCL_MAT
    _carray=input.cbegin();
@@ -240,10 +240,10 @@ class UCL_H_Vec : public UCL_BaseMat {
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    _array=input.begin()+offset;
    _array=(numtyp *)input.begin()+offset;
    _end=_array+_cols;
    #ifdef _OCL_MAT
    _host_view(*this,input,_row_bytes);
    _host_view(*this,input,offset*sizeof(numtyp),_row_bytes);
    #endif
  }


@@ -162,6 +162,8 @@ class UCL_Vector {
  inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
|
||||
/// Block until command_queue associated with matrix is complete
|
||||
inline void sync() { host.sync(); }
|
||||
/// Assert that any ops in associate command queue have been issued to device
|
||||
inline void flush() { ucl_flush(host.cq()); }
|
||||
|
||||
///Get the size of a row on the host (including any padding) in elements
|
||||
inline size_t row_size() const { return host.row_size(); }
|
||||
|
||||
@ -14,6 +14,9 @@
|
||||
***************************************************************************/
|
||||
|
||||
#include "lal_answer.h"
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
#define AnswerT Answer<numtyp,acctyp>
|
||||
@ -81,6 +84,10 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot,
|
||||
_time_cast=0.0;
|
||||
_time_cpu_idle=0.0;
|
||||
|
||||
success=success && (error_flag.alloc(1,*dev,UCL_READ_WRITE,
|
||||
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
if (success) error_flag.zero();
|
||||
|
||||
return success && alloc(ef_inum);
|
||||
}
|
||||
|
||||
@ -111,6 +118,7 @@ bool AnswerT::add_fields(const bool charge, const bool rot) {
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::clear() {
|
||||
_gpu_bytes=0;
|
||||
error_flag.clear();
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
@ -138,12 +146,21 @@ double AnswerT::host_memory_usage() const {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom) {
|
||||
const bool ef_atom, const bool vf_atom,
|
||||
const int red_blocks) {
|
||||
time_answer.start();
|
||||
_eflag=eflag;
|
||||
_vflag=vflag;
|
||||
_ef_atom=ef_atom;
|
||||
_vf_atom=vf_atom;
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
_ev_stride=_inum;
|
||||
#else
|
||||
if (ef_atom || vf_atom)
|
||||
_ev_stride=_inum;
|
||||
else
|
||||
_ev_stride=red_blocks;
|
||||
#endif
|
||||
|
||||
int csize=_ev_fields;
|
||||
if (!eflag)
|
||||
@ -152,20 +169,24 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
||||
csize-=6;
|
||||
|
||||
if (csize>0)
|
||||
engv.update_host(_inum*csize,true);
|
||||
engv.update_host(_ev_stride*csize,true);
|
||||
if (_rot)
|
||||
force.update_host(_inum*4*2,true);
|
||||
else
|
||||
force.update_host(_inum*4,true);
|
||||
time_answer.stop();
|
||||
|
||||
#ifndef GERYON_OCL_FLUSH
|
||||
force.flush();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom,
|
||||
int *ilist) {
|
||||
int *ilist, const int red_blocks) {
|
||||
_ilist=ilist;
|
||||
copy_answers(eflag,vflag,ef_atom,vf_atom);
|
||||
copy_answers(eflag,vflag,ef_atom,vf_atom,red_blocks);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -177,21 +198,24 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
double evdwl=0.0;
|
||||
int vstart=0;
|
||||
if (_eflag) {
|
||||
for (int i=0; i<_inum; i++)
|
||||
#if (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd reduction(+:evdwl)
|
||||
#endif
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
evdwl+=engv[i];
|
||||
if (_ef_atom) {
|
||||
if (_ilist==nullptr) {
|
||||
for (int i=0; i<_inum; i++)
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
eatom[i]+=engv[i];
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++)
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
eatom[_ilist[i]]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart=_inum;
|
||||
vstart=_ev_stride;
|
||||
}
|
||||
if (_vflag) {
|
||||
int iend=vstart+_inum;
|
||||
int iend=vstart+_ev_stride;
|
||||
for (int j=0; j<6; j++) {
|
||||
for (int i=vstart; i<iend; i++)
|
||||
virial[j]+=engv[i];
|
||||
@ -206,8 +230,8 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
vatom[_ilist[ii++]][j]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart+=_inum;
|
||||
iend+=_inum;
|
||||
vstart+=_ev_stride;
|
||||
iend+=_ev_stride;
|
||||
}
|
||||
}
|
||||
|
||||
@ -224,28 +248,36 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
return energy_virial(eatom,vatom,virial);
|
||||
|
||||
double evdwl=0.0;
|
||||
int ii, vstart=0, iend=_inum;
|
||||
int ii, vstart=0, iend=_ev_stride;
|
||||
if (_eflag) {
|
||||
iend=_inum*2;
|
||||
for (int i=0; i<_inum; i++)
|
||||
iend=_ev_stride*2;
|
||||
#if (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd reduction(+:evdwl)
|
||||
#endif
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
evdwl+=engv[i];
|
||||
for (int i=_inum; i<iend; i++)
|
||||
ecoul+=engv[i];
|
||||
double ecv=0.0;
|
||||
#if (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd reduction(+:ecv)
|
||||
#endif
|
||||
for (int i=_ev_stride; i<iend; i++)
|
||||
ecv+=engv[i];
|
||||
ecoul+=ecv;
|
||||
if (_ef_atom) {
|
||||
if (_ilist==nullptr) {
|
||||
for (int i=0; i<_inum; i++)
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
eatom[i]+=engv[i];
|
||||
for (int i=_inum; i<iend; i++)
|
||||
for (int i=_ev_stride; i<iend; i++)
|
||||
eatom[i]+=engv[i];
|
||||
} else {
|
||||
for (int i=0, ii=0; i<_inum; i++)
|
||||
for (int i=0, ii=0; i<_ev_stride; i++)
|
||||
eatom[_ilist[ii++]]+=engv[i];
|
||||
for (int i=_inum, ii=0; i<iend; i++)
|
||||
for (int i=_ev_stride, ii=0; i<iend; i++)
|
||||
eatom[_ilist[ii++]]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart=iend;
|
||||
iend+=_inum;
|
||||
iend+=_ev_stride;
|
||||
}
|
||||
if (_vflag) {
|
||||
for (int j=0; j<6; j++) {
|
||||
@ -260,8 +292,8 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
vatom[_ilist[ii++]][j]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart+=_inum;
|
||||
iend+=_inum;
|
||||
vstart+=_ev_stride;
|
||||
iend+=_ev_stride;
|
||||
}
|
||||
}
|
||||
|
||||
@ -270,24 +302,63 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::get_answers(double **f, double **tor) {
|
||||
int fl=0;
|
||||
if (_ilist==nullptr) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
f[i][0]+=force[fl];
|
||||
f[i][1]+=force[fl+1];
|
||||
f[i][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { acctyp x,y,z,w; } vec4d_t;
|
||||
vec3d *fp=reinterpret_cast<vec3d*>(&(f[0][0]));
|
||||
vec4d_t *forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
|
||||
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel
|
||||
#endif
|
||||
{
|
||||
#if (LAL_USE_OMP == 1)
|
||||
const int nthreads = omp_get_num_threads();
|
||||
const int tid = omp_get_thread_num();
|
||||
const int idelta = _inum / nthreads + 1;
|
||||
const int ifrom = tid * idelta;
|
||||
const int ito = std::min(ifrom + idelta, _inum);
|
||||
#else
|
||||
const int tid = 0;
|
||||
const int ifrom = 0;
|
||||
const int ito = _inum;
|
||||
#endif
|
||||
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
fp[i].x+=forcep[i].x;
|
||||
fp[i].y+=forcep[i].y;
|
||||
fp[i].z+=forcep[i].z;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
tor[i][0]+=force[fl];
|
||||
tor[i][1]+=force[fl+1];
|
||||
tor[i][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
vec3d *torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
|
||||
forcep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
torp[i].x+=forcep[i].x;
|
||||
torp[i].y+=forcep[i].y;
|
||||
torp[i].z+=forcep[i].z;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel
|
||||
#endif
|
||||
{
|
||||
#if (LAL_USE_OMP == 1)
|
||||
const int nthreads = omp_get_num_threads();
|
||||
const int tid = omp_get_thread_num();
|
||||
const int idelta = _inum / nthreads + 1;
|
||||
const int ifrom = tid * idelta;
|
||||
const int ito = std::min(ifrom + idelta, _inum);
|
||||
int fl=ifrom*4;
|
||||
#else
|
||||
const int tid = 0;
|
||||
const int ifrom = 0;
|
||||
const int ito = _inum;
|
||||
int fl=0;
|
||||
#endif
|
||||
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
int ii=_ilist[i];
|
||||
f[ii][0]+=force[fl];
|
||||
f[ii][1]+=force[fl+1];
|
||||
@ -295,7 +366,8 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
fl+=4;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
fl=_inum*4 + ifrom*4;
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
int ii=_ilist[i];
|
||||
tor[ii][0]+=force[fl];
|
||||
tor[ii][1]+=force[fl+1];
|
||||
@ -304,6 +376,7 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
||||
@ -110,12 +110,12 @@ class Answer {
|
||||
// -------------------------COPY FROM GPU -------------------------------
|
||||
|
||||
/// Copy answers from device into read buffer asynchronously
|
||||
void copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom);
|
||||
void copy_answers(const bool eflag, const bool vflag, const bool ef_atom,
|
||||
const bool vf_atom, const int red_blocks);
|
||||
|
||||
/// Copy answers from device into read buffer asynchronously
|
||||
void copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom, int *ilist);
|
||||
void copy_answers(const bool eflag, const bool vflag, const bool ef_atom,
|
||||
const bool vf_atom, int *ilist, const int red_blocks);
|
||||
|
||||
/// Copy energy and virial data into LAMMPS memory
|
||||
double energy_virial(double *eatom, double **vatom, double *virial);
|
||||
@ -128,11 +128,13 @@ class Answer {
|
||||
void get_answers(double **f, double **tor);
|
||||
|
||||
inline double get_answers(double **f, double **tor, double *eatom,
|
||||
double **vatom, double *virial, double &ecoul) {
|
||||
double **vatom, double *virial, double &ecoul,
|
||||
int &error_flag_in) {
|
||||
double ta=MPI_Wtime();
|
||||
time_answer.sync_stop();
|
||||
_time_cpu_idle+=MPI_Wtime()-ta;
|
||||
double ts=MPI_Wtime();
|
||||
if (error_flag[0]) error_flag_in=error_flag[0];
|
||||
double evdw=energy_virial(eatom,vatom,virial,ecoul);
|
||||
get_answers(f,tor);
|
||||
_time_cast+=MPI_Wtime()-ts;
|
||||
@ -151,6 +153,8 @@ class Answer {
|
||||
UCL_Vector<acctyp,acctyp> force;
|
||||
/// Energy and virial per-atom storage
|
||||
UCL_Vector<acctyp,acctyp> engv;
|
||||
/// Error flag
|
||||
UCL_Vector<int,int> error_flag;
|
||||
|
||||
/// Device timers
|
||||
UCL_Timer time_answer;
|
||||
@ -162,7 +166,7 @@ class Answer {
|
||||
bool alloc(const int inum);
|
||||
|
||||
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
|
||||
int _max_local, _inum, _e_fields, _ev_fields, _ans_fields;
|
||||
int _max_local, _inum, _e_fields, _ev_fields, _ans_fields, _ev_stride;
|
||||
int *_ilist;
|
||||
double _time_cast, _time_cpu_idle;
|
||||
|
||||
|
||||
@ -414,9 +414,9 @@ const char *atom=0;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AtomT::compile_kernels(UCL_Device &dev) {
|
||||
std::string flags = "-D"+std::string(OCL_VENDOR);
|
||||
std::string flags = "";
|
||||
atom_program=new UCL_Program(dev);
|
||||
atom_program->load_string(atom,flags);
|
||||
atom_program->load_string(atom,flags,nullptr,screen);
|
||||
k_cast_x.set_function(*atom_program,"kernel_cast_x");
|
||||
_compiled=true;
|
||||
}
|
||||
|
||||
@ -24,6 +24,9 @@
|
||||
#include "geryon/ocl_mat.h"
|
||||
#include "geryon/ocl_kernel.h"
|
||||
using namespace ucl_opencl;
|
||||
#ifndef LAL_NO_OCL_EV_JIT
|
||||
#define LAL_OCL_EV_JIT
|
||||
#endif
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_timer.h"
|
||||
#include "geryon/nvc_mat.h"
|
||||
@ -178,7 +181,7 @@ class Atom {
|
||||
ii+=m_size-n;
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
|
||||
view.view_offset(0,buffer,m_size*m_size);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -197,7 +200,26 @@ class Atom {
|
||||
ii+=m_size-n;
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
|
||||
view.view_offset(0,buffer,m_size*m_size);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
/// Pack LAMMPS atom type constants into 2 vectors and copy to device
|
||||
template <class dev_typ, class t1, class t2>
|
||||
inline void type_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
|
||||
UCL_H_Vec<numtyp> &buffer, t1 ***one, t2 ***two) {
|
||||
int ii=0;
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
for (int k=0; k<n; k++) {
|
||||
buffer[ii*2]=static_cast<numtyp>(one[i][j][k]);
|
||||
buffer[ii*2+1]=static_cast<numtyp>(two[i][j][k]);
|
||||
ii++;
|
||||
}
|
||||
}
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view_offset(0,buffer,n*n*n);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -217,7 +239,7 @@ class Atom {
|
||||
ii+=m_size-n;
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
|
||||
view.view_offset(0,buffer,m_size*m_size);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -238,7 +260,7 @@ class Atom {
|
||||
ii+=m_size-n;
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
|
||||
view.view_offset(0,buffer,m_size*m_size);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -251,7 +273,7 @@ class Atom {
|
||||
buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),n,*dev);
|
||||
view.view_offset(0,buffer,n);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -261,6 +283,9 @@ class Atom {
|
||||
inline void data_unavail()
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
|
||||
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { numtyp x,y,z,w; } vec4d_t;
|
||||
|
||||
/// Cast positions and types to write buffer
|
||||
inline void cast_x_data(double **host_ptr, const int *host_type) {
|
||||
if (_x_avail==false) {
|
||||
@ -269,13 +294,16 @@ class Atom {
|
||||
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
|
||||
#else
|
||||
int wl=0;
|
||||
vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
|
||||
vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel for schedule(static)
|
||||
#endif
|
||||
for (int i=0; i<_nall; i++) {
|
||||
x[wl]=host_ptr[i][0];
|
||||
x[wl+1]=host_ptr[i][1];
|
||||
x[wl+2]=host_ptr[i][2];
|
||||
x[wl+3]=host_type[i];
|
||||
wl+=4;
|
||||
xp[i].x=host_p[i].x;
|
||||
xp[i].y=host_p[i].y;
|
||||
xp[i].z=host_p[i].z;
|
||||
xp[i].w=host_type[i];
|
||||
}
|
||||
#endif
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
@ -320,6 +348,11 @@ class Atom {
|
||||
} else if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
|
||||
else
|
||||
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp parallel for simd schedule(static)
|
||||
#elif (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd
|
||||
#endif
|
||||
for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
@ -346,6 +379,11 @@ class Atom {
|
||||
} else if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
|
||||
else
|
||||
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp parallel for simd schedule(static)
|
||||
#elif (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd
|
||||
#endif
|
||||
for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
@ -370,13 +408,16 @@ class Atom {
|
||||
memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
|
||||
#else
|
||||
int wl=0;
|
||||
vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
|
||||
vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel for schedule(static)
|
||||
#endif
|
||||
for (int i=0; i<_nall; i++) {
|
||||
v[wl]=host_ptr[i][0];
|
||||
v[wl+1]=host_ptr[i][1];
|
||||
v[wl+2]=host_ptr[i][2];
|
||||
v[wl+3]=host_tag[i];
|
||||
wl+=4;
|
||||
vp[i].x=host_p[i].x;
|
||||
vp[i].y=host_p[i].y;
|
||||
vp[i].z=host_p[i].z;
|
||||
vp[i].w=host_tag[i];
|
||||
}
|
||||
#endif
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
|
||||
@ -40,170 +40,521 @@
|
||||
nbor_begin+=offset; \
|
||||
}
|
||||
|
||||
#if (ARCH < 300)
|
||||
#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
|
||||
i, numj, stride, nbor_end, nbor_begin) \
|
||||
i=nbor_mem[ii]; \
|
||||
nbor_begin=ii+nbor_stride; \
|
||||
numj=nbor_mem[nbor_begin]; \
|
||||
nbor_begin+=nbor_stride+ii*(t_per_atom-1); \
|
||||
stride=fast_mul(t_per_atom,nbor_stride); \
|
||||
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & \
|
||||
(t_per_atom-1)); \
|
||||
nbor_begin+=offset;
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
|
||||
eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR]; \
|
||||
red_acc[0][tid]=f.x; \
|
||||
red_acc[1][tid]=f.y; \
|
||||
red_acc[2][tid]=f.z; \
|
||||
red_acc[3][tid]=energy; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
#if (SHUFFLE_AVAIL == 0)
|
||||
|
||||
#define simd_reduce_add1(width, local, offset, tid, one) \
|
||||
local[0][tid]=one; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) local[0][tid] += local[0][tid+s]; \
|
||||
} \
|
||||
if (offset==0) one=local[0][tid];
|
||||
|
||||
#define simd_reduce_add2(width, local, offset, tid, one, two) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<4; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
f.x=red_acc[0][tid]; \
|
||||
f.y=red_acc[1][tid]; \
|
||||
f.z=red_acc[2][tid]; \
|
||||
energy=red_acc[3][tid]; \
|
||||
if (vflag>0) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid]=virial[r]; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r]=red_acc[r][tid]; \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
}
|
||||
|
||||
#define simd_reduce_add3(width, local, offset, tid, one, two, three) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
local[2][tid]=three; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
local[2][tid] += local[2][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
three=local[2][tid]; \
|
||||
}
|
||||
|
||||
#define simd_reduce_add6(width, local, offset, tid, one, two, three, \
|
||||
four, five, six) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
local[2][tid]=three; \
|
||||
local[3][tid]=four; \
|
||||
local[4][tid]=five; \
|
||||
local[5][tid]=six; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
local[2][tid] += local[2][tid+s]; \
|
||||
local[3][tid] += local[3][tid+s]; \
|
||||
local[4][tid] += local[4][tid+s]; \
|
||||
local[5][tid] += local[5][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
three=local[2][tid]; \
|
||||
four=local[3][tid]; \
|
||||
five=local[4][tid]; \
|
||||
six=local[5][tid]; \
|
||||
}
|
||||
|
||||
#define simd_reduce_arr(trip, width, local, offset, tid, arr) \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid]=arr[r]; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid] += local[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
arr[r]=local[r][tid]; \
|
||||
}
|
||||
|
||||
#define block_reduce_add1(width, local, tid, one) \
|
||||
local[0][tid]=one; \
|
||||
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
|
||||
__syncthreads(); \
|
||||
if (tid < s) local[0][tid] += local[0][tid+s]; \
|
||||
} \
|
||||
if (tid<width) { \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (tid < s) local[0][tid] += local[0][tid+s]; \
|
||||
} \
|
||||
if (tid==0) one=local[0][tid]; \
|
||||
}
|
||||
|
||||
#define block_reduce_add2(width, local, tid, one, two) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
|
||||
__syncthreads(); \
|
||||
if (tid < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid<width) { \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (tid < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define block_reduce_arr(trip, width, local, tid, arr) \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid]=arr[r]; \
|
||||
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
|
||||
__syncthreads(); \
|
||||
if (tid < s) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid] += local[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid<width) { \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (tid < s) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid] += local[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
arr[r]=local[r][tid]; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define local_allocate_store_pair() \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
#define local_allocate_store_charge() \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
#define local_allocate_store_bio() \
|
||||
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
|
||||
#define local_allocate_store_ellipse() \
|
||||
__local acctyp red_acc[6][BLOCK_ELLIPSE];
|
||||
#define local_allocate_store_three() \
|
||||
__local acctyp red_acc[6][BLOCK_ELLIPSE];
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
|
||||
if (EVFLAG && (vflag==2 || eflag==2)) { \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (EVFLAG && (eflag || vflag)) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_add1(simd_size(), red_acc, tid, energy); \
|
||||
if (vflag) __syncthreads(); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
if (EVFLAG && eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
if (EVFLAG && vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR]; \
|
||||
red_acc[0][tid]=f.x; \
|
||||
red_acc[1][tid]=f.y; \
|
||||
red_acc[2][tid]=f.z; \
|
||||
red_acc[3][tid]=energy; \
|
||||
red_acc[4][tid]=e_coul; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<5; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
|
||||
if (EVFLAG && (vflag==2 || eflag==2)) { \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
|
||||
} \
|
||||
} \
|
||||
f.x=red_acc[0][tid]; \
|
||||
f.y=red_acc[1][tid]; \
|
||||
f.z=red_acc[2][tid]; \
|
||||
energy=red_acc[3][tid]; \
|
||||
e_coul=red_acc[4][tid]; \
|
||||
if (vflag>0) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid]=virial[r]; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (EVFLAG && (eflag || vflag)) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \
|
||||
if (vflag) __syncthreads(); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r]=red_acc[r][tid]; \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
if (EVFLAG && eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
if (EVFLAG && vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
|
||||
eflag, vflag, ans, engv) \
|
||||
#define simd_reduce_add1(width, one) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) one += shfl_down(one, s, width);
|
||||
|
||||
#define simd_reduce_add2(width, one, two) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
one += shfl_down(one, s, width); \
|
||||
two += shfl_down(two, s, width); \
|
||||
}
|
||||
|
||||
#define simd_reduce_add3(width, one, two, three) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
one += shfl_down(one, s, width); \
|
||||
two += shfl_down(two, s, width); \
|
||||
three += shfl_down(three, s, width); \
|
||||
}
|
||||
|
||||
#define simd_reduce_add6(width, one, two, three, four, five, six) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
one += shfl_down(one, s, width); \
|
||||
two += shfl_down(two, s, width); \
|
||||
three += shfl_down(three, s, width); \
|
||||
four += shfl_down(four, s, width); \
|
||||
five += shfl_down(five, s, width); \
|
||||
six += shfl_down(six, s, width); \
|
||||
}
|
||||
|
||||
#define simd_reduce_arr(trip, width, arr) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
arr[r] += shfl_down(arr[r], s, width); \
|
||||
}
|
||||
|
||||
#if (EVFLAG == 1)
|
||||
|
||||
#define local_allocate_store_pair() \
|
||||
__local acctyp red_acc[7][BLOCK_PAIR / SIMD_SIZE];
|
||||
#define local_allocate_store_charge() \
|
||||
__local acctyp red_acc[8][BLOCK_PAIR / SIMD_SIZE];
|
||||
#define local_allocate_store_bio() \
|
||||
__local acctyp red_acc[8][BLOCK_BIO_PAIR / SIMD_SIZE];
|
||||
#define local_allocate_store_ellipse()
|
||||
#define local_allocate_store_three() \
|
||||
__local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE];
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
f.x += shfl_xor(f.x, s, t_per_atom); \
|
||||
f.y += shfl_xor(f.y, s, t_per_atom); \
|
||||
f.z += shfl_xor(f.z, s, t_per_atom); \
|
||||
energy += shfl_xor(energy, s, t_per_atom); \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (vflag==2 || eflag==2) { \
|
||||
if (eflag) \
|
||||
simd_reduce_add1(t_per_atom,energy); \
|
||||
if (vflag) \
|
||||
simd_reduce_arr(6, t_per_atom,virial); \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
|
||||
} \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (eflag || vflag) { \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
const int vwidth = simd_size(); \
|
||||
const int voffset = tid & (simd_size() - 1); \
|
||||
const int bnum = tid/simd_size(); \
|
||||
int active_subgs = BLOCK_SIZE_X/simd_size(); \
|
||||
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
|
||||
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
|
||||
if (bnum < active_subgs) { \
|
||||
if (eflag) { \
|
||||
simd_reduce_add1(vwidth, energy); \
|
||||
if (voffset==0) red_acc[6][bnum] = energy; \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (voffset==0) \
|
||||
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
__syncthreads(); \
|
||||
if (tid < active_subgs) { \
|
||||
if (eflag) energy = red_acc[6][tid]; \
|
||||
if (vflag) \
|
||||
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
|
||||
} else { \
|
||||
if (eflag) energy = (acctyp)0; \
|
||||
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (bnum == 0) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag) { \
|
||||
simd_reduce_add1(vwidth, energy); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
if (eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
if (vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
f.x += shfl_xor(f.x, s, t_per_atom); \
|
||||
f.y += shfl_xor(f.y, s, t_per_atom); \
|
||||
f.z += shfl_xor(f.z, s, t_per_atom); \
|
||||
energy += shfl_xor(energy, s, t_per_atom); \
|
||||
e_coul += shfl_xor(e_coul, s, t_per_atom); \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (vflag==2 || eflag==2) { \
|
||||
if (eflag) \
|
||||
simd_reduce_add2(t_per_atom,energy,e_coul); \
|
||||
if (vflag) \
|
||||
simd_reduce_arr(6, t_per_atom,virial); \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
|
||||
} \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (eflag || vflag) { \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
const int vwidth = simd_size(); \
|
||||
const int voffset = tid & (simd_size() - 1); \
|
||||
const int bnum = tid/simd_size(); \
|
||||
int active_subgs = BLOCK_SIZE_X/simd_size(); \
|
||||
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
|
||||
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
|
||||
if (bnum < active_subgs) { \
|
||||
if (eflag) { \
|
||||
simd_reduce_add2(vwidth, energy, e_coul); \
|
||||
if (voffset==0) { \
|
||||
red_acc[6][bnum] = energy; \
|
||||
red_acc[7][bnum] = e_coul; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (voffset==0) \
|
||||
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
__syncthreads(); \
|
||||
if (tid < active_subgs) { \
|
||||
if (eflag) { \
|
||||
energy = red_acc[6][tid]; \
|
||||
e_coul = red_acc[7][tid]; \
|
||||
} \
|
||||
if (vflag) \
|
||||
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
|
||||
} else { \
|
||||
if (eflag) energy = e_coul = (acctyp)0; \
|
||||
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (bnum == 0) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag) { \
|
||||
simd_reduce_add2(vwidth, energy, e_coul); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
if (eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
if (vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define local_allocate_store_pair()
|
||||
#define local_allocate_store_charge()
|
||||
#define local_allocate_store_bio()
|
||||
#define local_allocate_store_ellipse()
|
||||
#define local_allocate_store_three()
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) ans[ii]=f;
|
||||
|
||||
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) ans[ii]=f;
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -21,12 +21,15 @@ namespace LAMMPS_AL {
|
||||
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0) {
|
||||
BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0), _onetype(0) {
|
||||
device=&global_device;
|
||||
ans=new Answer<numtyp,acctyp>();
|
||||
nbor=new Neighbor();
|
||||
pair_program=nullptr;
|
||||
ucl_device=nullptr;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
pair_program_noev=nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -36,6 +39,10 @@ BaseAtomicT::~BaseAtomic() {
|
||||
k_pair_fast.clear();
|
||||
k_pair.clear();
|
||||
if (pair_program) delete pair_program;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
k_pair_noev.clear();
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -49,7 +56,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_name) {
|
||||
const char *k_name, const int onetype) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
@ -64,28 +71,29 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
if (ucl_device!=device->gpu) _compiled=false;
|
||||
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
compile_kernels(*ucl_device,pair_program,k_name);
|
||||
compile_kernels(*ucl_device,pair_program,k_name,onetype);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
@ -102,8 +110,8 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAtomicT::estimate_gpu_overhead() {
|
||||
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
|
||||
void BaseAtomicT::estimate_gpu_overhead(const int add_kernels) {
|
||||
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -164,8 +172,8 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum,
|
||||
atom->cast_copy_x(host_x,host_type);
|
||||
|
||||
int mn;
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
|
||||
nspecial, special, success, mn);
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
|
||||
tag, nspecial, special, success, mn, ans->error_flag);
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
@ -179,11 +187,25 @@ template <class numtyp, class acctyp>
|
||||
void BaseAtomicT::compute(const int f_ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
@ -207,8 +229,8 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full,
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
|
||||
loop(eflag,vflag);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
const int red_blocks=loop(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
}
|
||||
@ -220,12 +242,26 @@ template <class numtyp, class acctyp>
|
||||
int ** BaseAtomicT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
int **nspecial, tagint **special,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
@ -254,8 +290,8 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full,
|
||||
*ilist=nbor->host_ilist.begin();
|
||||
*jnum=nbor->host_acc.begin();
|
||||
|
||||
loop(eflag,vflag);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
const int red_blocks=loop(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
@ -270,19 +306,46 @@ double BaseAtomicT::host_memory_usage_atomic() const {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *kname) {
|
||||
if (_compiled)
|
||||
const char *kname, const int onetype) {
|
||||
if (_compiled && _onetype==onetype)
|
||||
return;
|
||||
_onetype=onetype;
|
||||
|
||||
std::string s_fast=std::string(kname)+"_fast";
|
||||
if (pair_program) delete pair_program;
|
||||
pair_program=new UCL_Program(dev);
|
||||
pair_program->load_string(pair_str,device->compile_string().c_str());
|
||||
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
||||
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
|
||||
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_pair_fast.set_function(*pair_program,s_fast.c_str());
|
||||
k_pair.set_function(*pair_program,kname);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
oclstring = device->compile_string()+" -DEVFLAG=0";
|
||||
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
pair_program_noev=new UCL_Program(dev);
|
||||
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
|
||||
#else
|
||||
k_pair_sel = &k_pair_fast;
|
||||
#endif
|
||||
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.cl_device_version() >= 210) {
|
||||
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
|
||||
#endif
|
||||
if (_threads_per_atom > mx_subgroup_sz)
|
||||
_threads_per_atom = mx_subgroup_sz;
|
||||
device->set_simd_size(mx_subgroup_sz);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template class BaseAtomic<PRECISION,ACC_PRECISION>;
|
||||
|
||||
@ -53,10 +53,11 @@ class BaseAtomic {
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const void *pair_program, const char *k_name);
|
||||
const void *pair_program, const char *k_name,
|
||||
const int onetype=0);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead();
|
||||
void estimate_gpu_overhead(const int add_kernels=0);
|
||||
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
@ -100,7 +101,7 @@ class BaseAtomic {
|
||||
/// Accumulate timers
|
||||
inline void acc_timers() {
|
||||
if (device->time_device()) {
|
||||
nbor->acc_timers();
|
||||
nbor->acc_timers(screen);
|
||||
time_pair.add_to_total();
|
||||
atom->acc_timers();
|
||||
ans->acc_timers();
|
||||
@ -179,23 +180,31 @@ class BaseAtomic {
|
||||
Neighbor *nbor;
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_pair_fast, k_pair;
|
||||
UCL_Program *pair_program, *pair_program_noev;
|
||||
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
|
||||
inline int block_size() { return _block_size; }
|
||||
inline void set_kernel(const int eflag, const int vflag) {
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
if (eflag || vflag) k_pair_sel = &k_pair_fast;
|
||||
else k_pair_sel = &k_pair_noev;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// --------------------------- TEXTURES -----------------------------
|
||||
UCL_Texture pos_tex;
|
||||
|
||||
protected:
|
||||
bool _compiled;
|
||||
int _block_size, _threads_per_atom;
|
||||
int _block_size, _threads_per_atom, _onetype;
|
||||
double _max_bytes, _max_an_bytes;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k,
|
||||
const int onetype);
|
||||
|
||||
virtual void loop(const bool _eflag, const bool _vflag) = 0;
|
||||
virtual int loop(const int eflag, const int vflag) = 0;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -27,6 +27,9 @@ BaseChargeT::BaseCharge() : _compiled(false), _max_bytes(0) {
|
||||
nbor=new Neighbor();
|
||||
pair_program=nullptr;
|
||||
ucl_device=nullptr;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
pair_program_noev=nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -36,6 +39,10 @@ BaseChargeT::~BaseCharge() {
|
||||
k_pair_fast.clear();
|
||||
k_pair.clear();
|
||||
if (pair_program) delete pair_program;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
k_pair_noev.clear();
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -64,21 +71,11 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
if (ucl_device!=device->gpu) _compiled=false;
|
||||
|
||||
ucl_device=device->gpu;
|
||||
@ -88,6 +85,17 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
_block_bio_size=device->block_bio_pair();
|
||||
compile_kernels(*ucl_device,pair_program,k_name);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
|
||||
@ -104,8 +112,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseChargeT::estimate_gpu_overhead() {
|
||||
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
|
||||
void BaseChargeT::estimate_gpu_overhead(const int add_kernels) {
|
||||
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -166,8 +174,8 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
|
||||
atom->cast_copy_x(host_x,host_type);
|
||||
|
||||
int mn;
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
|
||||
nspecial, special, success, mn);
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
|
||||
tag, nspecial, special, success, mn, ans->error_flag);
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
@ -181,12 +189,26 @@ template <class numtyp, class acctyp>
|
||||
void BaseChargeT::compute(const int f_ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, double *host_q,
|
||||
const int nlocal, double *boxlo, double *prd) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
@ -215,8 +237,8 @@ void BaseChargeT::compute(const int f_ago, const int inum_full,
|
||||
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
|
||||
boxlo, prd);
|
||||
|
||||
loop(eflag,vflag);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
const int red_blocks=loop(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
}
|
||||
@ -228,13 +250,27 @@ template <class numtyp, class acctyp>
int** BaseChargeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **nspecial, tagint **special,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -269,8 +305,8 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();

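Throughout the update, loop() now returns the number of reduction blocks the kernel launched, and copy_answers() receives that count (red_blocks) so the host knows how many partial energy/virial sums remain to fold when the device did not finish the reduction. A hedged sketch of such a host-side fold, under that assumption (names hypothetical, not the library's API):

    // Fold the per-block partial sums staged by the device kernel.
    double reduce_partials(const double *partials, const int red_blocks) {
      double total = 0.0;
      for (int i = 0; i < red_blocks; ++i)
        total += partials[i];
      return total;
    }
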
@ -292,13 +328,37 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");

#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif

}

template class BaseCharge<PRECISION,ACC_PRECISION>;
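
With LAL_OCL_EV_JIT defined, compile_kernels() above builds the same pair-kernel source twice: the primary program with -DEVFLAG=1 compiles the energy/virial accumulation paths in, while the *_noev program with -DEVFLAG=0 strips them for force-only timesteps. A minimal sketch of just the flag construction, assuming a base string like the one device->compile_string() supplies (the helper name is hypothetical):

    #include <string>
    #include <utility>

    // Returns {flags for the EV build, flags for the no-EV build}.
    std::pair<std::string, std::string> ev_jit_flags(const std::string &base) {
      return {base + " -DEVFLAG=1",   // accumulates energy/virial
              base + " -DEVFLAG=0"};  // leaner force-only variant
    }

The subgroup query that follows (clamping _threads_per_atom to max_subgroup_size) guards against requesting more lanes per atom than one OpenCL subgroup can supply on devices reporting OpenCL 2.1 or newer.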

@ -57,7 +57,7 @@ class BaseCharge {
const void *pair_program, const char *k_name);

/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
void estimate_gpu_overhead(const int add_kernels=0);

/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
@ -103,7 +103,7 @@ class BaseCharge {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
@ -177,9 +177,15 @@ class BaseCharge {
Neighbor *nbor;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}

// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
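
set_kernel() above is what makes the dual compilation pay off: it is called once per compute() before loop(), flipping k_pair_sel between the two compiled variants. A hedged standalone illustration of the pointer-select pattern (the Kernel type here is a simplified stand-in, not the Geryon class):

    struct Kernel { /* compiled kernel handle */ };

    struct PairStyle {
      Kernel k_pair_fast, k_pair_noev;
      Kernel *k_pair_sel = &k_pair_fast;
      // Pick the cheaper no-EV kernel whenever neither energy nor
      // virial output is needed on the current timestep.
      void set_kernel(const int eflag, const int vflag) {
        k_pair_sel = (eflag || vflag) ? &k_pair_fast : &k_pair_noev;
      }
    };
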
@ -194,7 +200,7 @@ class BaseCharge {

void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);

virtual void loop(const bool _eflag, const bool _vflag) = 0;
virtual int loop(const int eflag, const int vflag) = 0;
};

}

@ -27,6 +27,9 @@ BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) {
nbor=new Neighbor();
pair_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
}

template <class numtyp, class acctyp>
@ -36,6 +39,10 @@ BaseDipoleT::~BaseDipole() {
k_pair_fast.clear();
k_pair.clear();
if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
}

template <class numtyp, class acctyp>
@ -65,30 +72,30 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
_gpu_host=1;

_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);

int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

if (ucl_device!=device->gpu) _compiled=false;

ucl_device=device->gpu;
atom=&device->atom;

_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name);

if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);

@ -168,8 +175,8 @@ inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type);

int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);

double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
@ -183,12 +190,26 @@ template <class numtyp, class acctyp>
void BaseDipoleT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q, double **host_mu,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -219,8 +240,8 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -232,14 +253,28 @@ template <class numtyp, class acctyp>
int** BaseDipoleT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
int **nspecial, tagint **special,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double **host_mu,
double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -277,8 +312,8 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();

@ -300,14 +335,38 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
mu_tex.get_texture(*pair_program,"mu_tex");

#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif

}

template class BaseDipole<PRECISION,ACC_PRECISION>;

@ -102,7 +102,7 @@ class BaseDipole {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
@ -176,9 +176,16 @@ class BaseDipole {
Neighbor *nbor;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}


// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
@ -187,14 +194,14 @@ class BaseDipole {

protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);

virtual void loop(const bool _eflag, const bool _vflag) = 0;
virtual int loop(const int eflag, const int vflag) = 0;
};

}

@ -27,6 +27,9 @@ BaseDPDT::BaseDPD() : _compiled(false), _max_bytes(0) {
nbor=new Neighbor();
pair_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
}

template <class numtyp, class acctyp>
@ -36,6 +39,10 @@ BaseDPDT::~BaseDPD() {
k_pair_fast.clear();
k_pair.clear();
if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
}

template <class numtyp, class acctyp>
@ -47,9 +54,9 @@ int BaseDPDT::bytes_per_atom_atomic(const int max_nbors) const {
template <class numtyp, class acctyp>
int BaseDPDT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const void *pair_program, const char *k_name) {
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name, const int onetype) {
screen=_screen;

int gpu_nbor=0;
@ -63,31 +70,30 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
if (host_nlocal>0)
_gpu_host=1;

_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
_threads_per_atom=device->threads_per_atom();

int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);

if (success!=0)
return success;

if (ucl_device!=device->gpu) _compiled=false;

ucl_device=device->gpu;
atom=&device->atom;

_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name);
compile_kernels(*ucl_device,pair_program,k_name,onetype);

if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -167,8 +173,8 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type);

int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);

double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
@ -179,16 +185,30 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseDPDT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, tagint *tag, double **host_v,
const double dtinvsqrt, const int seed, const int timestep,
void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag_in,
const bool vflag_in, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success, tagint *tag,
double **host_v, const double dtinvsqrt,
const int seed, const int timestep,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -218,8 +238,8 @@ void BaseDPDT::compute(const int f_ago, const int inum_full,
_seed = seed;
_timestep = timestep;

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -231,8 +251,8 @@ template <class numtyp, class acctyp>
int** BaseDPDT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
int **nspecial, tagint **special, const bool eflag_in,
const bool vflag_in, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
@ -240,6 +260,20 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
const int seed, const int timestep,
double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -275,8 +309,8 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
_seed = seed;
_timestep = timestep;

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();

@ -291,20 +325,48 @@ double BaseDPDT::host_memory_usage_atomic() const {

template <class numtyp, class acctyp>
void BaseDPDT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
const char *kname, const int onetype) {
if (_compiled && _onetype==onetype)
return;

_onetype=onetype;

std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
vel_tex.get_texture(*pair_program,"vel_tex");

#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif

}

template class BaseDPD<PRECISION,ACC_PRECISION>;
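
BaseDPD adds one more JIT axis on top of EVFLAG: when a pair style uses a single atom type, init_atomic() passes onetype through and compile_kernels() appends -DONETYPE=<type> so the kernel can hard-code the type and skip per-pair parameter lookups; _onetype is cached so a later init with a different value forces recompilation. A hedged sketch of that guard and flag construction (names hypothetical; the original uses device->toa() rather than std::to_string):

    #include <string>

    // Rebuild only when the specialization actually changes.
    bool needs_recompile(const bool compiled, const int cached, const int onetype) {
      return !compiled || cached != onetype;
    }

    std::string add_onetype(std::string flags, const int onetype) {
      if (onetype) flags += " -DONETYPE=" + std::to_string(onetype);
      return flags;
    }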

@ -52,7 +52,8 @@ class BaseDPD {
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name);
const void *pair_program, const char *k_name,
const int onetype=0);

/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -101,7 +102,7 @@ class BaseDPD {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
@ -177,9 +178,16 @@ class BaseDPD {
Neighbor *nbor;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}


// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
@ -191,13 +199,14 @@ class BaseDPD {

protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
int _block_size, _threads_per_atom, _onetype;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k, const int onetype);
virtual int loop(const int eflag, const int vflag) = 0;
};

}

@ -29,7 +29,8 @@ const char *ellipsoid_nbor=0;
extern Device<PRECISION,ACC_PRECISION> global_device;

template <class numtyp, class acctyp>
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) {
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0),
host_olist_size(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
@ -37,6 +38,10 @@ BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) {
ellipsoid_program=nullptr;
lj_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
ellipsoid_program_noev=nullptr;
lj_program_noev=nullptr;
#endif
}

template <class numtyp, class acctyp>
@ -53,6 +58,14 @@ BaseEllipsoidT::~BaseEllipsoid() {
if (nbor_program) delete nbor_program;
if (ellipsoid_program) delete ellipsoid_program;
if (lj_program) delete lj_program;
#if defined(LAL_OCL_EV_JIT)
k_ellipsoid_noev.clear();
k_ellipsoid_sphere_noev.clear();
k_sphere_ellipsoid_noev.clear();
k_lj_fast.clear();
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
if (lj_program_noev) delete lj_program_noev;
#endif
}

template <class numtyp, class acctyp>
@ -89,11 +102,6 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;

if (ucl_device!=device->gpu) _compiled=false;

ucl_device=device->gpu;
@ -102,6 +110,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;

// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);

@ -133,12 +146,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
if (_multiple_forms && gpu_nbor!=0)
return -9;

if (_multiple_forms)
if (_multiple_forms) {
ans->force.zero();

// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS)
return -3;
host_olist_size = nbor->max_atoms();
host_olist = new int[nbor->max_atoms()];
}

_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

@ -160,7 +172,10 @@ template <class numtyp, class acctyp>
void BaseEllipsoidT::clear_base() {
// Output any timing information
output_times();
host_olist.clear();
if (host_olist_size) {
host_olist_size = 0;
delete []host_olist;
}

time_nbor1.clear();
time_ellipsoid.clear();
@ -206,10 +221,14 @@ void BaseEllipsoidT::output_times() {
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
double max_mb=mpi_max_bytes/(1024*1024);
double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];

#ifdef USE_OPENCL
// Workaround for timing issue on Intel OpenCL
if (times[3] > 80e6) times[3]=0.0;
#endif

if (device->replica_me()==0)
if (screen && times[5]>0.0) {
if (screen && times[7]>0.0) {
int replica_size=device->replica_size();

fprintf(screen,"\n\n-------------------------------------");
@ -218,9 +237,8 @@ void BaseEllipsoidT::output_times() {
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");

if (device->procs_per_gpu()==1 && t_time>0) {
if (device->procs_per_gpu()==1 && times[3]>0) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
if (nbor->gpu_nbor()>0)
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
@ -229,13 +247,15 @@ void BaseEllipsoidT::output_times() {
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
if (nbor->gpu_nbor()==2)
fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size);
if (times[6]>0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Vector width: %d.\n", device->simd_size());
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
if (nbor->gpu_nbor()==2)
fprintf(screen,"CPU Neighbor: %.4f s.\n",times[9]/replica_size);
fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
@ -256,11 +276,13 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
if (shared_types) {
k_nbor_fast.set_size(GX,BX);
k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start,
&inum, &nbor->dev_packed, &form_low, &form_high);
&inum, &nbor->dev_packed, &form_low, &form_high,
&_threads_per_atom);
} else {
k_nbor.set_size(GX,BX);
k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride,
&start, &inum, &nbor->dev_packed, &form_low, &form_high);
&start, &inum, &nbor->dev_packed, &form_low, &form_high,
&_threads_per_atom);
}
}

@ -298,7 +320,7 @@ void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
p++;
}
}
nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size());
nbor->get_host(inum,host_olist,numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
return;
}
@ -330,8 +352,8 @@ inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type);

int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
nbor->copy_unpacked(inum,mn);
_last_ellipse=inum;
_max_last_ellipse=inum;
@ -348,11 +370,18 @@ template <class numtyp, class acctyp>
int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
acc_timers();
int eflag, vflag;
if (eflag_in) eflag=2;
else eflag=0;
if (vflag_in) vflag=2;
else vflag=0;

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
zero_timers();
@ -373,7 +402,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
}
int *list;
if (_multiple_forms)
list=host_olist.begin();
list=host_olist;
else
list=ilist;

@ -384,7 +413,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
atom->add_quat_data();

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,list);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,list,inum);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return list;
@ -394,15 +423,23 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
int** BaseEllipsoidT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_quat) {
acc_timers();
int eflag, vflag;
if (eflag_in) eflag=2;
else eflag=0;
if (vflag_in) vflag=2;
else vflag=0;

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
zero_timers();
@ -435,7 +472,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
*jnum=nbor->host_acc.begin();

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum);
device->add_ans_object(ans);
hd_balancer.stop_timer();

@ -462,25 +499,26 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
std::string s_lj=kns+"_lj";
std::string s_lj_fast=kns+"_lj_fast";

std::string flags=device->compile_string();
std::string oclstring = device->compile_string()+" -DEVFLAG=1";

if (nbor_program) delete nbor_program;
nbor_program=new UCL_Program(dev);
nbor_program->load_string(ellipsoid_nbor,flags.c_str());
nbor_program->load_string(ellipsoid_nbor,oclstring.c_str(),nullptr,screen);
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
k_nbor.set_function(*nbor_program,"kernel_nbor");
neigh_tex.get_texture(*nbor_program,"pos_tex");

if (ellipsoid_program) delete ellipsoid_program;
ellipsoid_program=new UCL_Program(dev);
ellipsoid_program->load_string(ellipsoid_string,flags.c_str());
ellipsoid_program->load_string(ellipsoid_string,oclstring.c_str(),
nullptr,screen);
k_ellipsoid.set_function(*ellipsoid_program,kname);
pos_tex.get_texture(*ellipsoid_program,"pos_tex");
quat_tex.get_texture(*ellipsoid_program,"quat_tex");

if (lj_program) delete lj_program;
lj_program=new UCL_Program(dev);
lj_program->load_string(lj_string,flags.c_str());
lj_program->load_string(lj_string,oclstring.c_str(),nullptr,screen);
k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str());
k_lj_fast.set_function(*lj_program,s_lj_fast.c_str());
k_lj.set_function(*lj_program,s_lj.c_str());
@ -489,7 +527,52 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
lj_pos_tex.get_texture(*lj_program,"pos_tex");
lj_quat_tex.get_texture(*lj_program,"quat_tex");

#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
ellipsoid_program_noev=new UCL_Program(dev);
ellipsoid_program_noev->load_string(ellipsoid_string,oclstring.c_str(),
nullptr,screen);
k_ellipsoid_noev.set_function(*ellipsoid_program_noev,kname);

if (lj_program_noev) delete lj_program_noev;
lj_program_noev=new UCL_Program(dev);
lj_program_noev->load_string(lj_string,oclstring.c_str(),nullptr,screen);
k_sphere_ellipsoid_noev.set_function(*lj_program_noev,
s_sphere_ellipsoid.c_str());
k_lj_fast_noev.set_function(*lj_program_noev,s_lj_fast.c_str());
if (e_s)
k_ellipsoid_sphere_noev.set_function(*lj_program_noev,
s_ellipsoid_sphere.c_str());
#else
k_elps_sel = &k_ellipsoid;
k_elps_sphere_sel = &k_ellipsoid_sphere;
k_sphere_elps_sel = &k_sphere_ellipsoid;
k_lj_sel = &k_lj_fast;
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_lj_fast.max_subgroup_size(_block_size);
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid.max_subgroup_size(_block_size));
if (e_s)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere.max_subgroup_size(_block_size));
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_lj_fast_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid_noev.max_subgroup_size(_block_size));
if (e_s)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif

}

template class BaseEllipsoid<PRECISION,ACC_PRECISION>;

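The subgroup clamp at the end of compile_kernels() recurs in every class in this update: the usable SIMD width is the minimum of max_subgroup_size() over every kernel that will run (including the no-EV variants), and _threads_per_atom must not exceed it. A hedged sketch of that reduction over a list of queried sizes (names hypothetical):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // 'sizes' holds max_subgroup_size(_block_size) queried per kernel.
    size_t clamp_tpa(const std::vector<size_t> &sizes, size_t &threads_per_atom) {
      size_t mx = sizes.empty() ? 1 : *std::min_element(sizes.begin(), sizes.end());
      threads_per_atom = std::min(threads_per_atom, mx);
      return mx;  // also becomes the device SIMD-size hint
    }
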
@ -88,10 +88,10 @@ class BaseEllipsoid {
ans->resize(nlocal, success);
if (_multiple_forms) ans->force.zero();

if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
if (olist_size>host_olist_size) {
if (host_olist_size) delete []host_olist;
host_olist_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
host_olist = new int[host_olist_size];
}

nbor->resize(nlocal,host_inum,max_nbors,success);
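
host_olist is now a plain host array rather than a UCL_H_Vec, grown with the same 10% headroom rule the old path used. A minimal sketch of that grow-only reallocation, assuming the same semantics (old contents need not be preserved; the helper name is hypothetical):

    // Grow-only: reallocate with 10% headroom when the request exceeds
    // the current capacity; existing contents are discarded.
    void grow_olist(int *&olist, int &capacity, const int needed) {
      if (needed <= capacity) return;
      if (capacity) delete [] olist;
      capacity = static_cast<int>(static_cast<double>(needed) * 1.10);
      olist = new int[capacity];
    }
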
@ -116,7 +116,7 @@ class BaseEllipsoid {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_nbor1.add_to_total();
time_ellipsoid.add_to_total();
if (_multiple_forms) {
@ -223,14 +223,40 @@ class BaseEllipsoid {
/// Neighbor data
Neighbor *nbor;
/// ilist with particles sorted by type
UCL_H_Vec<int> host_olist;
int *host_olist;
int host_olist_size;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *nbor_program, *ellipsoid_program, *lj_program;
UCL_Program *ellipsoid_program_noev, *lj_program_noev;
UCL_Kernel k_nbor_fast, k_nbor;
UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid;
UCL_Kernel k_lj_fast, k_lj;
UCL_Kernel k_ellipsoid_noev, k_ellipsoid_sphere_noev;
UCL_Kernel k_sphere_ellipsoid_noev, k_lj_fast_noev;
UCL_Kernel *k_elps_sel, *k_elps_sphere_sel, *k_sphere_elps_sel, *k_lj_sel;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (_multiple_forms == false) {
if (eflag || vflag) k_elps_sel = &k_ellipsoid;
else k_elps_sel = &k_ellipsoid_noev;
} else {
if (eflag || vflag) {
k_elps_sel = &k_ellipsoid;
k_elps_sphere_sel = &k_ellipsoid_sphere;
k_sphere_elps_sel = &k_sphere_ellipsoid;
k_lj_sel = &k_lj_fast;
} else {
k_elps_sel = &k_ellipsoid_noev;
k_elps_sphere_sel = &k_ellipsoid_sphere_noev;
k_sphere_elps_sel = &k_sphere_ellipsoid_noev;
k_lj_sel = &k_lj_fast_noev;
}
}
#endif
}


// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex;
@ -240,7 +266,6 @@ class BaseEllipsoid {
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

// True if we want to use fast GB-sphere or sphere-sphere calculations
bool _multiple_forms;
@ -250,7 +275,7 @@ class BaseEllipsoid {
void compile_kernels(UCL_Device &dev, const void *ellipsoid_string,
const void *lj_string, const char *kname,const bool e_s);

virtual void loop(const bool _eflag, const bool _vflag) = 0;
virtual int loop(const int eflag, const int vflag) = 0;
};

}

@ -20,7 +20,7 @@ namespace LAMMPS_AL {
extern Device<PRECISION,ACC_PRECISION> global_device;

template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _onetype(-1) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
@ -29,6 +29,9 @@ BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
#endif
pair_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
}

template <class numtyp, class acctyp>
@ -44,6 +47,12 @@ BaseThreeT::~BaseThree() {
k_pair.clear();
k_short_nbor.clear();
if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_three_center_noev.clear();
k_three_end_noev.clear();
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
}

template <class numtyp, class acctyp>
@ -62,7 +71,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *two, const char *three_center,
const char *three_end, const char *short_nbor) {
const char *three_end, const char *short_nbor,
const int onetype, const int onetype3,
const int spq, const int tpa_override) {
screen=_screen;

int gpu_nbor=0;
@ -77,24 +88,16 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
if (host_nlocal>0)
_gpu_host=1;

_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else // neigh yes or tpa == 1
_nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
// Allow forcing threads per atom to 1 for tersoff due to subg sync issue
if (tpa_override)
_threads_per_atom=tpa_override;
else
_threads_per_atom=device->threads_per_three();

int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

if (ucl_device!=device->gpu) _compiled=false;

ucl_device=device->gpu;
@ -110,7 +113,19 @@ int BaseThreeT::init_three(const int nlocal, const int nall,

_block_pair=device->pair_block_size();
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,
short_nbor,onetype,onetype3,spq);

while (_threads_per_atom*_threads_per_atom>device->simd_size())
_threads_per_atom = _threads_per_atom / 2;

if (_threads_per_atom*_threads_per_atom>device->simd_size())
return -10;

success = device->init_nbor(nbor,nall,host_nlocal,nall,maxspecial,
_gpu_host,max_nbors,cell_size,true,1,true);
if (success!=0)
return success;

// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
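
For the three-body styles, threads per atom enter the kernels in two nested directions, so the square of _threads_per_atom must fit inside one SIMD/subgroup width; init_three() now halves the value until it does, and still bails out with -10 if it cannot. A standalone sketch of that fit, under the same power-of-two assumption (the function name is hypothetical):

    // Halve tpa until tpa*tpa fits in the device SIMD/subgroup width.
    int fit_tpa_to_simd(int tpa, const int simd_size) {
      while (tpa > 1 && tpa * tpa > simd_size)
        tpa /= 2;
      return tpa;
    }
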
@ -121,22 +136,21 @@ int BaseThreeT::init_three(const int nlocal, const int nall,

pos_tex.bind_float(atom->x,4);

int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;

_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
_max_an_bytes+=ans2->gpu_bytes();
#endif

int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);

return 0;
}

template <class numtyp, class acctyp>
void BaseThreeT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
void BaseThreeT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(4+add_kernels,_gpu_overhead,_driver_overhead);
}

template <class numtyp, class acctyp>
@ -152,7 +166,6 @@ void BaseThreeT::clear_atomic() {
time_pair.clear();
hd_balancer.clear();

dev_short_nbor.clear();
nbor->clear();
ans->clear();
#ifdef THREE_CONCURRENT
@ -186,6 +199,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,

// now the requirement is removed, allowing to work within pair hybrid
nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
nbor->copy_unpacked(nlist,mn);

double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
@ -201,7 +215,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
inline void BaseThreeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
@ -211,14 +225,22 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
resize_atom(inum,nall,success);
resize_local(nall,host_inum,nbor->max_nbors(),success);
if (!success)
return 0;
return;
atom->cast_copy_x(host_x,host_type);

_nall = nall;

// Increase the effective sub-domain size for neighbors of ghosts
// This is still inefficient because we are calculating neighbors for more
// ghosts than necessary due to increased ghost cutoff
const double ncut=nbor->cutoff()*2.0;
for (int i=0; i<3; i++) sublo[i]-=ncut;
for (int i=0; i<3; i++) subhi[i]+=ncut;

int mn;
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
nbor->copy_unpacked(nall,mn);

double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
@ -226,7 +248,6 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
#endif
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return mn;
}

// ---------------------------------------------------------------------------
@ -236,10 +257,24 @@ template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -260,19 +295,12 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
}

atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);

// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}

// _ainum to be used in loop() for short neighbor list build
_ainum = nlist;

@ -282,11 +310,11 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
#ifdef THREE_CONCURRENT
ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
const int red_blocks=loop(eflag,vflag,evatom,success);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans);
#ifdef THREE_CONCURRENT
ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans2);
#endif
hd_balancer.stop_timer();
@ -296,15 +324,29 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** BaseThreeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
int ** BaseThreeT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag_in,
const bool vflag_in, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -323,7 +365,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,

// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return nullptr;
@ -336,12 +378,6 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();

// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}

// _ainum to be used in loop() for short neighbor list build
_ainum = nall;

@ -351,11 +387,11 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
#ifdef THREE_CONCURRENT
ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
const int red_blocks=loop(eflag,vflag,evatom,success);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
#ifdef THREE_CONCURRENT
ans2->copy_answers(eflag,vflag,eatom,vatom);
ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans2);
#endif
hd_balancer.stop_timer();
@ -372,14 +408,24 @@ double BaseThreeT::host_memory_usage_atomic() const {
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *two, const char *three_center,
|
||||
const char *three_end, const char* short_nbor) {
|
||||
if (_compiled)
|
||||
const char *three_end, const char* short_nbor,
|
||||
const int onetype, const int onetype3,
|
||||
const int spq) {
|
||||
if (_compiled && _onetype==onetype && _onetype3==onetype3 && _spq==spq)
|
||||
return;
|
||||
|
||||
_onetype=onetype;
|
||||
_onetype3=onetype3;
|
||||
_spq=spq;
|
||||
|
||||
std::string vatom_name=std::string(three_end)+"_vatom";
|
||||
if (pair_program) delete pair_program;
|
||||
pair_program=new UCL_Program(dev);
|
||||
pair_program->load_string(pair_str,device->compile_string().c_str());
|
||||
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
||||
if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+
|
||||
" -DONETYPE3="+device->toa(_onetype3);
|
||||
if (_spq) oclstring+=" -DSPQ="+device->toa(_spq);
|
||||
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_three_center.set_function(*pair_program,three_center);
|
||||
k_three_end.set_function(*pair_program,three_end);
|
||||
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
|
||||
@ -387,12 +433,50 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
k_short_nbor.set_function(*pair_program,short_nbor);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
oclstring = device->compile_string()+" -DEVFLAG=0";
|
||||
if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+
|
||||
" -DONETYPE3="+device->toa(_onetype3);
|
||||
if (_spq) oclstring+=" -DSPQ="+device->toa(_spq);
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
pair_program_noev=new UCL_Program(dev);
|
||||
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_three_center_noev.set_function(*pair_program_noev,three_center);
|
||||
k_three_end_noev.set_function(*pair_program_noev,three_end);
|
||||
k_pair_noev.set_function(*pair_program_noev,two);
|
||||
#else
|
||||
k_sel = &k_pair;
|
||||
k_3center_sel = &k_three_center;
|
||||
k_3end_sel = &k_three_end;
|
||||
#endif

#ifdef THREE_CONCURRENT
k_three_end.cq(ucl_device->cq(_end_command_queue));
k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
#if defined(LAL_OCL_EV_JIT)
k_three_end_noev.cq(ucl_device->cq(_end_command_queue));
#endif
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair.max_subgroup_size(_block_size);
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_vatom.max_subgroup_size(_block_size));
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
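// note: on OpenCL 2.1+ devices, _threads_per_atom is clamped to the smallest sub-group size supported by any of the kernels just compiled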

}

template class BaseThree<PRECISION,ACC_PRECISION>;

@ -59,10 +59,12 @@ class BaseThree {
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end,
const char *k_short_nbor=nullptr);
const char *k_short_nbor=nullptr, const int onetype=-1,
const int onetype3=-1, const int spq=0,
const int tpa_override=0);

/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
void estimate_gpu_overhead(const int add_kernels=0);

/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
@ -109,7 +111,7 @@ class BaseThree {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
@ -134,9 +136,9 @@ class BaseThree {
int *numj, int **firstneigh, bool &success);

/// Build neighbor list on device
int build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
void build_nbor_list(const int inum, const int host_inum, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, bool &success);

/// Pair loop with host neighboring
@ -147,12 +149,12 @@ class BaseThree {
int &host_start, const double cpu_time, bool &success);

/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
int ** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, tagint **special,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, int **ilist,
int **numj, const double cpu_time, bool &success);

// -------------------------- DEVICE DATA -------------------------

@ -188,14 +190,29 @@ class BaseThree {
/// Neighbor data
Neighbor *nbor;

UCL_D_Vec<int> dev_short_nbor;
UCL_Kernel k_short_nbor;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
UCL_Kernel k_pair_noev, k_three_center_noev, k_three_end_noev;
UCL_Kernel *k_sel, *k_3center_sel, *k_3end_sel;
inline int block_pair() { return _block_pair; }
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) {
k_sel = &k_pair;
k_3center_sel = &k_three_center;
k_3end_sel = &k_three_end;
} else {
k_sel = &k_pair_noev;
k_3center_sel = &k_three_center_noev;
k_3end_sel = &k_three_end_noev;
}
#endif
}
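// (set_kernel() is a no-op unless LAL_OCL_EV_JIT is defined)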


// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
@ -203,18 +220,19 @@ class BaseThree {
protected:
bool _compiled;
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
int _gpu_nbor, _onetype, _onetype3, _spq;
double _max_bytes, _max_an_bytes;
int _max_nbors, _ainum, _nall;
int _ainum, _nall;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *two, const char *three_center,
const char *three_end, const char* short_nbor);
const char *three_end, const char* short_nbor,
const int onetype, const int onetype3,
const int spq);

virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0;
virtual int loop(const int eflag, const int vflag, const int evatom,
bool &success) = 0;
};

}

@ -113,20 +113,9 @@ double BeckT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BeckT::loop(const bool _eflag, const bool _vflag) {
int BeckT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -134,8 +123,8 @@ void BeckT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &beck1, &beck2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &beck1, &beck2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
@ -147,6 +136,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) {
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}
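// note: loop() now returns the grid size GX; the caller forwards it to copy_answers() (see red_blocks in BaseThree::compute above)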

template class Beck<PRECISION,ACC_PRECISION>;

@ -39,22 +39,25 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
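// note: EVFLAG is a compile-time macro (-DEVFLAG=1/0 at kernel JIT compile), so this guard and the eflag/vflag branches below cost nothing when energy/virial output is compiled out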

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -98,14 +101,14 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp term6 = pow(term1,(numtyp)-3);
numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e;
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -116,9 +119,9 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

__kernel void k_beck_fast(const __global numtyp4 *restrict x_,
@ -137,6 +140,9 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
__local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
@ -144,19 +150,19 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
beck2[tid]=beck2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -200,14 +206,14 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp term6 = pow(term1,(numtyp)-3);
numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e;
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -218,8 +224,8 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}


@ -72,7 +72,7 @@ class Beck : public BaseAtomic<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -55,7 +55,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
int init_ok=0;
if (world_me==0)
init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta,
AA, BB, special_lj, inum, nall, 300,
AA, BB, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BLMF.device->world_barrier();
@ -73,7 +73,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
}
if (gpu_rank==i && world_me!=0)
init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB,
special_lj, inum, nall, 300, maxspecial,
special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);

BLMF.device->gpu_barrier();

@ -138,20 +138,9 @@ double BornT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornT::loop(const bool _eflag, const bool _vflag) {
int BornT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -159,8 +148,8 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1,&coeff2,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1,&coeff2,
&cutsq_sigma, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
@ -176,6 +165,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class Born<PRECISION,ACC_PRECISION>;

@ -40,22 +40,25 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -92,12 +95,12 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -108,9 +111,9 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

__kernel void k_born_fast(const __global numtyp4 *restrict x_,
@ -130,27 +133,30 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}
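// note: coeff2 holds terms needed only for the energy, so it is staged into __local memory just when eflag is set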

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -187,12 +193,12 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -203,8 +209,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}


@ -82,7 +82,7 @@ class Born : public BaseAtomic<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -129,20 +129,9 @@ double BornCoulLongT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
int BornCoulLongT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -150,8 +139,8 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force,
@ -170,6 +159,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class BornCoulLong<PRECISION,ACC_PRECISION>;

@ -48,6 +48,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -57,18 +60,18 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -124,7 +127,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
@ -133,7 +136,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -144,9 +147,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
@ -169,28 +172,31 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -246,7 +252,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
@ -255,7 +261,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -266,8 +272,8 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -80,7 +80,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {

protected:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -63,6 +63,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -72,18 +75,18 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -155,7 +158,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -167,7 +170,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -178,9 +181,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
@ -203,28 +206,31 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -296,7 +302,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -308,7 +314,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -319,8 +325,8 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -60,7 +60,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);

@ -80,7 +80,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);


@ -60,7 +60,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);

@ -80,7 +80,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);


@ -131,20 +131,9 @@ double BornCoulWolfT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
int BornCoulWolfT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -152,8 +141,8 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
@ -171,6 +160,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class BornCoulWolf<PRECISION,ACC_PRECISION>;

@ -51,6 +51,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -60,18 +63,18 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -79,7 +82,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;

if (eflag>0) {
if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
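// note: e_self is scaled by 1/t_per_atom above because all t_per_atom threads assigned to atom i add it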
@ -137,7 +140,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
numtyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -149,7 +152,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -160,9 +163,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
@ -186,28 +189,31 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -216,7 +222,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

if (eflag>0) {
if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
@ -273,7 +279,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
numtyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -285,7 +291,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -296,8 +302,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -81,7 +81,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {

protected:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -52,6 +52,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -61,18 +64,18 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -80,7 +83,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;

if (eflag>0) {
if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
@ -139,7 +142,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
acctyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -151,7 +154,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -162,9 +165,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
@ -188,28 +191,31 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -218,7 +224,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

if (eflag>0) {
if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
@ -276,7 +282,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
acctyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -288,7 +294,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -299,8 +305,8 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -60,7 +60,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
@ -81,7 +81,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);

@ -60,7 +60,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
@ -81,7 +81,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);

@ -58,7 +58,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BORNMF.device->world_barrier();
@ -77,7 +77,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BORNMF.device->gpu_barrier();
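// note: the hard-coded 300-neighbor allocation is replaced by the host-supplied max_nbors throughout these *_gpu_init() routines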

@ -130,20 +130,9 @@ double BuckT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BuckT::loop(const bool _eflag, const bool _vflag) {
int BuckT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -151,8 +140,8 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
@ -165,6 +154,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class Buck<PRECISION,ACC_PRECISION>;

@ -39,22 +39,25 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -91,11 +94,11 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -106,9 +109,9 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

__kernel void k_buck_fast(const __global numtyp4 *restrict x_,
@ -127,27 +130,30 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -184,11 +190,11 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -199,8 +205,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}


@ -77,7 +77,7 @@ class Buck : public BaseAtomic<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -122,20 +122,9 @@ double BuckCoulT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
int BuckCoulT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -143,8 +132,8 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
@ -158,6 +147,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
&cutsq, &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class BuckCoul<PRECISION,ACC_PRECISION>;

@ -47,6 +47,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -56,18 +59,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -119,14 +122,14 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
e_coul += forcecoul;
if (rsq < cutsq[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -137,9 +140,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
@ -162,29 +165,32 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();
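// note: the barrier ensures the __local coefficient and special-scaling tiles are fully loaded before any work-item reads them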
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
@ -236,14 +242,14 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
if (EVFLAG && eflag) {
|
||||
e_coul += forcecoul;
|
||||
if (rsq < cutsq[mtype].y) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].z);
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
if (EVFLAG && vflag) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
@ -254,8 +260,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
|
||||
vflag,ans,engv);
|
||||
} // if ii
|
||||
}
|
||||
|
||||
|
||||
@ -78,7 +78,7 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
void loop(const bool _eflag, const bool _vflag);
|
||||
int loop(const int eflag, const int vflag);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -58,7 +58,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
|
||||
maxspecial, cell_size, gpu_split, screen,
|
||||
host_cut_ljsq, host_cut_coulsq,
|
||||
host_special_coul, qqrd2e);
|
||||
@ -78,7 +78,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
|
||||
maxspecial, cell_size, gpu_split, screen,
|
||||
host_cut_ljsq, host_cut_coulsq,
|
||||
host_special_coul, qqrd2e);
|
||||
|
||||
@ -126,20 +126,9 @@ double BuckCoulLongT::host_memory_usage() const {
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
||||
int BuckCoulLongT::loop(const int eflag, const int vflag) {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int eflag, vflag;
|
||||
if (_eflag)
|
||||
eflag=1;
|
||||
else
|
||||
eflag=0;
|
||||
|
||||
if (_vflag)
|
||||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
@ -147,8 +136,8 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
this->k_pair_sel->set_size(GX,BX);
|
||||
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
@ -163,6 +152,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
template class BuckCoulLong<PRECISION,ACC_PRECISION>;
|
||||
|
||||
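The loop() rewrite above is repeated for every pair style in this commit: the bool-to-int flag conversion disappears because the flags now arrive as ints from the caller, and loop() returns the computed grid size instead of void. The fast path also launches through the k_pair_sel pointer instead of naming k_pair_fast directly, so the init code can select a kernel variant once. A hedged sketch of the new shape (free function, illustrative names only):

#include <cmath>

// Flags are forwarded unchanged; the grid size GX is returned so the
// caller can reuse it (e.g. for sizing per-block reduction buffers).
int loop_sketch(const int eflag, const int vflag,
                int inum, int block_size, int threads_per_atom) {
  const int BX = block_size;
  int GX = static_cast<int>(ceil(static_cast<double>(inum) /
                                 (BX / threads_per_atom)));
  (void)eflag; (void)vflag;   // a real loop() passes these to the kernel
  return GX;
}
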
@ -48,6 +48,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -57,18 +60,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -126,7 +129,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) {
@ -134,7 +137,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].z);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -145,9 +148,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
@ -171,28 +174,31 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -250,7 +256,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) {
@ -258,7 +264,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].z);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -269,8 +275,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -78,7 +78,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -59,7 +59,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0;
if (world_me==0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

@ -78,7 +78,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
}
if (gpu_rank==i && world_me!=0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);


@ -56,7 +56,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0;
if (world_me==0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BUCKMF.device->world_barrier();
@ -74,7 +74,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
}
if (gpu_rank==i && world_me!=0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BUCKMF.device->gpu_barrier();

166 lib/gpu/lal_charmm.cpp Normal file
@ -0,0 +1,166 @@
/***************************************************************************
charmm.cpp
-------------------
W. Michael Brown (ORNL)

Class for acceleration of the charmm/coul pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : brownw@ornl.gov
***************************************************************************/

#if defined(USE_OPENCL)
#include "charmm_cl.h"
#elif defined(USE_CUDART)
const char *charmm_long=0;
#else
#include "charmm_cubin.h"
#endif

#include "lal_charmm.h"
#include <cassert>
namespace LAMMPS_AL {
#define CHARMMT CHARMM<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> device;

template <class numtyp, class acctyp>
CHARMMT::CHARMM() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}

template <class numtyp, class acctyp>
CHARMMT::~CHARMM() {
clear();
}

template <class numtyp, class acctyp>
int CHARMMT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
int CHARMMT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double cut_lj_innersq,
const double cut_coul_innersq, const double denom_lj,
const double denom_coul, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,charmm,"k_charmm");
if (success!=0)
return success;

// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_bio_shared_types=this->device->max_bio_shared_types();
if (this->_block_bio_size>=64 && mix_arithmetic &&
lj_types<=max_bio_shared_types)
shared_types=true;
_lj_types=lj_types;

// Allocate a host write buffer for data initialization
int h_size=lj_types*lj_types;
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<h_size*32; i++)
host_write[i]=0.0;

lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_lj3,host_lj4);

if (shared_types) {
ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
}

sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);

_cut_bothsq = host_cut_bothsq;
_cut_coulsq = host_cut_coulsq;
_cut_ljsq = host_cut_ljsq;
_cut_lj_innersq = cut_lj_innersq;
_cut_coul_innersq = cut_coul_innersq;
_qqrd2e=qqrd2e;
_denom_lj=denom_lj;
_denom_coul=denom_coul;

_allocated=true;
this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
return 0;
}

template <class numtyp, class acctyp>
void CHARMMT::clear() {
if (!_allocated)
return;
_allocated=false;

lj1.clear();
ljd.clear();
sp_lj.clear();
this->clear_atomic();
}

template <class numtyp, class acctyp>
double CHARMMT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CHARMM<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int CHARMMT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->_block_bio_size;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, this->_nbor_data,
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&_cut_coul_innersq, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, this->_nbor_data,
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&_cut_coul_innersq, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class CHARMM<PRECISION,ACC_PRECISION>;
}
303 lib/gpu/lal_charmm.cu Normal file
@ -0,0 +1,303 @@
// **************************************************************************
// charmm.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the charmm/coul pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/

#ifdef NV_KERNEL

#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif

#else
#define pos_tex x_
#define q_tex q_
#endif

__kernel void k_charmm(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict ljd,
const __global numtyp *restrict sp_lj,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp denom_lj,
const numtyp denom_coul,
const numtyp cut_bothsq,
const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const numtyp cut_coul_innersq,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);

int n_stride;
local_allocate_store_bio();

acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;

for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];

numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;

numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;

// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;

if (rsq<cut_bothsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, switch1;
numtyp lj3, lj4;

if (rsq < cut_ljsq) {
numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x);
numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);

numtyp sig_r_6 = sig6*sig6*r2inv;
sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
lj4 = (numtyp)4.0*eps*sig_r_6;
lj3 = lj4*sig_r_6;
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)*
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)*
denom_lj;
switch2 *= lj3-lj4;
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;

if (rsq < cut_coulsq) {
numtyp rinv = ucl_rsqrt(rsq);
fetch(forcecoul,j,q_tex);
forcecoul *= factor_coul * qqrd2e * qtmp * rinv;
if (rsq > cut_coul_innersq) {
numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
(cut_coulsq + (numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) *
denom_coul;
forcecoul *= switch3;
}
} else
forcecoul = (numtyp)0.0;

force = (force_lj + forcecoul) * r2inv;

f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;

if (EVFLAG && eflag) {
e_coul += forcecoul;
if (rsq < cut_ljsq) {
numtyp e=lj3-lj4;
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
}

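The switch1/switch2 arithmetic in k_charmm above (and in k_charmm_fast below) implements the CHARMM switching function. Reading denom_lj as \(1/(r_c^2-r_{in}^2)^3\), which is an inference from the code rather than something stated in this diff, the Lennard-Jones energy is tapered between the inner cutoff \(r_{in}\) and the outer cutoff \(r_c\) by

\[ S(r^2) = \frac{(r_c^2-r^2)^2\,\bigl(r_c^2+2r^2-3r_{in}^2\bigr)}{(r_c^2-r_{in}^2)^3}, \qquad r_{in}^2 < r^2 \le r_c^2, \]

so the energy becomes \(E\,S\); the switch2 term equals \(-2r^2\,dS/d(r^2)\) and, multiplied by \(E = lj3-lj4\), supplies the chain-rule contribution that keeps the force consistent with the switched energy. The Coulomb branch applies the same polynomial through denom_coul.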
__kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict ljd_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp denom_lj,
const numtyp denom_coul,
const numtyp cut_bothsq,
const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const numtyp cut_coul_innersq,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);

__local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_bio();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_BIO_SHARED_TYPES)
ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;

for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];

numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;

numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;

// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;

if (rsq<cut_bothsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, switch1;
numtyp lj3, lj4;

if (rsq < cut_ljsq) {
numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x);
numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);

numtyp sig_r_6 = sig6*sig6*r2inv;
sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
lj4 = (numtyp)4.0*eps*sig_r_6;
lj3 = lj4*sig_r_6;
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)*
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)*
denom_lj;
switch2 *= lj3-lj4;
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;

if (rsq < cut_coulsq) {
numtyp rinv = ucl_rsqrt(rsq);
fetch(forcecoul,j,q_tex);
forcecoul *= factor_coul * qqrd2e * qtmp * rinv;
if (rsq > cut_coul_innersq) {
numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
(cut_coulsq + (numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) *
denom_coul;
forcecoul *= switch3;
}
} else
forcecoul = (numtyp)0.0;

force = (force_lj + forcecoul) * r2inv;

f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;

if (EVFLAG && eflag) {
e_coul += forcecoul;
if (rsq < cut_ljsq) {
numtyp e=lj3-lj4;
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
}

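A note on both CHARMM kernels: rather than reading a precomputed ntypes-squared coefficient table, they mix the per-type parameters cached in ljd on the fly, geometric in epsilon and arithmetic in sigma (the Lorentz-Berthelot rule, matching the mix_arithmetic flag in init()):

\[ \epsilon_{ij}=\sqrt{\epsilon_i\,\epsilon_j}, \qquad \sigma_{ij}=\tfrac{1}{2}\,(\sigma_i+\sigma_j), \qquad E = 4\,\epsilon_{ij}\Bigl[\bigl(\sigma_{ij}/r\bigr)^{12}-\bigl(\sigma_{ij}/r\bigr)^{6}\Bigr], \]

which matches lj4 = 4*eps*(sig/r)^6 and lj3 = lj4*(sig/r)^6 in the code and keeps the shared-memory footprint linear in the number of types (MAX_BIO_SHARED_TYPES entries instead of ntypes*ntypes slots).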
89 lib/gpu/lal_charmm.h Normal file
@ -0,0 +1,89 @@
/***************************************************************************
charmm.h
-------------------
W. Michael Brown (ORNL)

Class for acceleration of the charmm/coul pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : brownw@ornl.gov
***************************************************************************/

#ifndef LAL_CHARMM_
#define LAL_CHARMM_

#include "lal_base_charge.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class CHARMM : public BaseCharge<numtyp, acctyp> {
public:
CHARMM();
~CHARMM();

/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double cut_lj_innersq,
const double cut_coul_innersq, const double denom_lj,
const double denom_coul, double **epsilon, double **sigma,
const bool mix_arithmetic);

/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;

/// Total host memory used by library for pair style
double host_memory_usage() const;

// --------------------------- TYPE DATA --------------------------

/// x = lj1, y = lj2, z = lj3, w = lj4
UCL_D_Vec<numtyp4> lj1;
/// x = epsilon, y = sigma
UCL_D_Vec<numtyp2> ljd;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;

/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;

/// Number of atom types
int _lj_types;

numtyp _qqrd2e, _denom_lj, _denom_coul;

numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
numtyp _cut_coul_innersq;

private:
bool _allocated;
int loop(const int eflag, const int vflag);
};

}

#endif
137 lib/gpu/lal_charmm_ext.cpp Normal file
@ -0,0 +1,137 @@
/***************************************************************************
charmm_long_ext.cpp
-------------------
W. Michael Brown (ORNL)

Functions for LAMMPS access to charmm/coul/long acceleration routines.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : brownw@ornl.gov
***************************************************************************/

#include <iostream>
#include <cassert>
#include <cmath>

#include "lal_charmm.h"

using namespace std;
using namespace LAMMPS_AL;

static CHARMM<PRECISION,ACC_PRECISION> CRMMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double cut_lj_innersq, const double cut_coul_innersq,
const double denom_lj, const double denom_coul,
double **epsilon, double **sigma,
const bool mix_arithmetic) {
CRMMF.clear();
gpu_mode=CRMMF.device->gpu_mode();
double gpu_split=CRMMF.device->particle_split();
int first_gpu=CRMMF.device->first_device();
int last_gpu=CRMMF.device->last_device();
int world_me=CRMMF.device->world_me();
int gpu_rank=CRMMF.device->gpu_rank();
int procs_per_gpu=CRMMF.device->procs_per_gpu();

CRMMF.device->init_message(screen,"lj/charmm/coul/charmm",first_gpu,
last_gpu);

bool message=false;
if (CRMMF.device->replica_me()==0 && screen)
message=true;

if (message) {
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}

int init_ok=0;
if (world_me==0)
CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, cut_lj_innersq, cut_coul_innersq,
denom_lj, denom_coul, epsilon, sigma, mix_arithmetic);

CRMMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");

for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, cut_lj_innersq, cut_coul_innersq, denom_lj,
denom_coul, epsilon, sigma, mix_arithmetic);

CRMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");

if (init_ok==0)
CRMMF.estimate_gpu_overhead();

return init_ok;
}

void crm_gpu_clear() {
CRMMF.clear();
}

int** crm_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}

void crm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CRMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}

double crm_gpu_bytes() {
return CRMMF.host_memory_usage();
}

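crm_gpu_init() above follows the staged initialization protocol shared by all the *_gpu_init() wrappers in this diff: process 0 initializes (and compiles the kernels) first, all ranks wait at a world barrier, then the processes sharing each GPU initialize one at a time. A condensed sketch of that control flow (stubbed helpers standing in for the real device/MPI machinery, not the library API):

static int do_init() { return 0; }   // compile kernels, copy constants
static void world_barrier() {}       // barrier over all MPI ranks
static void gpu_barrier() {}         // barrier over ranks sharing one GPU

int staged_gpu_init(int world_me, int gpu_rank, int procs_per_gpu) {
  int init_ok = 0;
  if (world_me == 0)
    init_ok = do_init();             // rank 0 warms the kernel cache once
  world_barrier();                   // everyone waits for the compile
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      init_ok = do_init();           // one co-resident rank per turn
    gpu_barrier();
  }
  return init_ok;                    // 0 on success, negative on error
}
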
@ -131,20 +131,9 @@ double CHARMMLongT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
int CHARMMLongT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->_block_bio_size;
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -152,8 +141,8 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
@ -171,6 +160,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class CHARMMLong<PRECISION,ACC_PRECISION>;

@ -47,18 +47,21 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
int n_stride;
local_allocate_store_bio();

acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -122,7 +125,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
@ -132,7 +135,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
energy+=factor_lj*e;
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -143,9 +146,9 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
@ -168,6 +171,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,

__local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_bio();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_BIO_SHARED_TYPES)
@ -175,20 +181,20 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -258,7 +264,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
@ -268,7 +274,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*e;
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -277,10 +283,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
virial[5] += dely*delz*force;
}
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -79,7 +79,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -60,7 +60,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
int init_ok=0;
if (world_me==0)
CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial, cell_size,
offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
epsilon,sigma,mix_arithmetic);
@ -80,7 +80,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
}
if (gpu_rank==i && world_me!=0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
host_lj4, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,

@ -140,20 +140,9 @@ double ColloidT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ColloidT::loop(const bool _eflag, const bool _vflag) {
int ColloidT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -161,8 +150,8 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj,
&colloid1, &colloid2, &form,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
@ -176,6 +165,7 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class Colloid<PRECISION,ACC_PRECISION>;

@ -42,22 +42,25 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -146,7 +149,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=(numtyp)0.0;
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
@ -160,7 +163,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -171,9 +174,9 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

__kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
@ -198,6 +201,9 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
__local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
@ -205,23 +211,23 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
colloid1[tid]=colloid1_in[tid];
colloid2[tid]=colloid2_in[tid];
form[tid]=form_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
lj3[tid]=lj3_in[tid];
}

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -310,7 +316,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=(numtyp)0.0;
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
@ -325,7 +331,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -336,8 +342,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
