diff --git a/doc/src/Commands_compute.rst b/doc/src/Commands_compute.rst
index fd68ce3e39..7c73583a4f 100644
--- a/doc/src/Commands_compute.rst
+++ b/doc/src/Commands_compute.rst
@@ -58,6 +58,7 @@ KOKKOS, o = OPENMP, t = OPT.
    * :doc:`fep/ta `
    * :doc:`force/tally `
    * :doc:`fragment/atom `
+   * :doc:`gaussian/grid/local (k) `
    * :doc:`global/atom `
    * :doc:`group/group `
    * :doc:`gyration `
@@ -140,8 +141,8 @@ KOKKOS, o = OPENMP, t = OPT.
    * :doc:`smd/vol `
    * :doc:`snap `
    * :doc:`sna/atom `
-   * :doc:`sna/grid `
-   * :doc:`sna/grid/local `
+   * :doc:`sna/grid (k) `
+   * :doc:`sna/grid/local (k) `
    * :doc:`snad/atom `
    * :doc:`snav/atom `
    * :doc:`sph/e/atom `
diff --git a/doc/src/compute.rst b/doc/src/compute.rst
index 082f93a6c4..9a8a1734fb 100644
--- a/doc/src/compute.rst
+++ b/doc/src/compute.rst
@@ -236,6 +236,7 @@ The individual style names on the :doc:`Commands compute ` pag
 * :doc:`fep/ta ` - compute free energies for a test area perturbation
 * :doc:`force/tally ` - force between two groups of atoms via the tally callback mechanism
 * :doc:`fragment/atom ` - fragment ID for each atom
+* :doc:`gaussian/grid/local ` - local array of Gaussian atomic contributions on a regular grid
 * :doc:`global/atom ` - assign global values to each atom from arrays of global values
 * :doc:`group/group ` - energy/force between two groups of atoms
 * :doc:`gyration ` - radius of gyration of group of atoms
diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst
new file mode 100644
index 0000000000..4ae99e7b55
--- /dev/null
+++ b/doc/src/compute_gaussian_grid_local.rst
@@ -0,0 +1,97 @@
+.. index:: compute gaussian/grid/local
+.. index:: compute gaussian/grid/local/kk
+
+compute gaussian/grid/local command
+===================================
+
+Accelerator Variants: *gaussian/grid/local/kk*
+
+Syntax
+""""""
+
+.. code-block:: LAMMPS
+
+   compute ID group-ID gaussian/grid/local grid nx ny nz rcutfac R_1 R_2 ... sigma_1 sigma_2 ...
+
+* ID, group-ID are documented in :doc:`compute <compute>` command
+* gaussian/grid/local = style name of this compute command
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
+* *sigma_1, sigma_2,...* = Gaussian widths, one for each type (distance units)
+
+Examples
+""""""""
+
+.. code-block:: LAMMPS
+
+   compute mygrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4
+
+Description
+"""""""""""
+
+Define a computation that calculates a Gaussian representation of the ionic
+structure. This representation is used for the efficient evaluation
+of quantities related to the structure factor in a grid-based workflow,
+such as the ML-DFT workflow MALA :ref:`(Ellis) <Ellis2021b>`, for which it was originally
+implemented. Usage of the workflow is described in a separate publication :ref:`(Fiedler) <Fiedler2023>`.
+
+For each LAMMPS type, a separate sum of Gaussians is calculated, using
+a separate Gaussian broadening per type. The computation
+is always performed on the numerical grid; no atom-based version of this
+compute exists. The Gaussian representation can only be computed in a local
+fashion; thus the output array only contains rows for grid points
+that are local to the processor subdomain. The layout of the grid is the same
+as for the :doc:`sna/grid/local <compute_sna_atom>` command.
+
+Namely, the array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest. Each row of the array contains
+the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the values of the Gaussians
+(one floating point number per type per grid point).
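+
+A minimal usage sketch is shown below; it simply illustrates how the local
+array can be accessed and mirrors the bundled ``examples/snap/in.gaussian.grid``
+input. It assumes a simulation box with two atom types has already been
+defined, so that the per-type Gaussian values sit in columns 7 and 8 of the
+local array; the compute and dump IDs and the dump file name are arbitrary:
+
+.. code-block:: LAMMPS
+
+   # Gaussian descriptors for both types on a 40x40x40 grid
+   compute mygrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4
+
+   # write the full local array (ix iy iz x y z val1 val2) to a file
+   dump glocal all local 1000 dump.gauss c_mygrid[*]
+
+   # track the per-type maxima in the thermodynamic output
+   compute max1 all reduce max c_mygrid[7] inputs local
+   compute max2 all reduce max c_mygrid[8] inputs local
+   thermo_style custom step c_max1 c_max2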
+
+----------
+
+
+.. include:: accel_styles.rst
+
+
+
+----------
+
+Output info
+"""""""""""
+
+Compute *gaussian/grid/local* evaluates a local array.
+The array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest. The array contains :math:`ntypes+6` columns,
+where *ntypes* is the number of LAMMPS types. The first three columns are
+the global indexes *ix*, *iy*, and *iz*, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the *ntypes* columns
+containing the values of the Gaussians for each type.
+
+Restrictions
+""""""""""""
+
+These computes are part of the ML-SNAP package. They are only enabled
+if LAMMPS was built with that package. See the :doc:`Build package
+<Build_package>` page for more info.
+
+Related commands
+""""""""""""""""
+
+:doc:`compute sna/grid/local <compute_sna_atom>`
+
+----------
+
+.. _Ellis2021b:
+
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) `_
+
+.. _Fiedler2023:
+
+**(Fiedler)** Fiedler, Modine, Schmerler, Vogel, Popoola, Thompson, Rajamanickam, and Cangi,
+`npj Comp. Mater., 9, 115 (2023) `_
+
diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst
index 179c362dc6..2572093499 100644
--- a/doc/src/compute_sna_atom.rst
+++ b/doc/src/compute_sna_atom.rst
@@ -3,7 +3,9 @@
 .. index:: compute snav/atom
 .. index:: compute snap
 .. index:: compute sna/grid
+.. index:: compute sna/grid/kk
 .. index:: compute sna/grid/local
+.. index:: compute sna/grid/local/kk
 
 compute sna/atom command
 ========================
@@ -20,9 +22,14 @@ compute snap command
 compute sna/grid command
 ========================
 
+compute sna/grid/kk command
+===========================
+
 compute sna/grid/local command
 ==============================
 
+Accelerator Variants: *sna/grid/local/kk*
+
 Syntax
 """"""
 
@@ -33,17 +40,17 @@ Syntax
    compute ID group-ID snav/atom rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
    compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
    compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid/local nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid/local grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
 
 * ID, group-ID are documented in :doc:`compute ` command
 * sna/atom = style name of this compute command
-* rcutfac = scale factor applied to all cutoff radii (positive real)
-* rfac0 = parameter in distance to angle conversion (0 < rcutfac < 1)
-* twojmax = band limit for bispectrum components (non-negative integer)
-* R_1, R_2,... = list of cutoff radii, one for each type (distance units)
-* w_1, w_2,...
= list of neighbor weights, one for each type -* nx, ny, nz = number of grid points in x, y, and z directions (positive integer) +* *rcutfac* = scale factor applied to all cutoff radii (positive real) +* *rfac0* = parameter in distance to angle conversion (0 < rcutfac < 1) +* *twojmax* = band limit for bispectrum components (non-negative integer) +* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units) +* *w_1, w_2,...* = list of neighbor weights, one for each type +* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer) * zero or more keyword/value pairs may be appended * keyword = *rmin0* or *switchflag* or *bzeroflag* or *quadraticflag* or *chem* or *bnormflag* or *wselfallflag* or *bikflag* or *switchinnerflag* or *sinner* or *dinner* or *dgradflag* or *nnn* or *wmode* or *delta* @@ -103,7 +110,7 @@ Examples compute snap all snap 1.4 0.95 6 2.0 1.0 compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 chem 2 0 1 compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 switchinnerflag 1 sinner 1.35 1.6 dinner 0.25 0.3 - compute bgrid all sna/grid/local 200 200 200 1.4 0.95 6 2.0 1.0 + compute bgrid all sna/grid/local grid 200 200 200 1.4 0.95 6 2.0 1.0 compute bnnn all sna/atom 9.0 0.99363 8 0.5 1.0 rmin0 0.0 nnn 24 wmode 1 delta 0.2 Description @@ -252,7 +259,8 @@ for finite-temperature Kohn-Sham density functional theory (:ref:`Ellis et al. `) Neighbor atoms not in the group do not contribute to the bispectrum components of the grid points. The distance cutoff :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom -*i'*. +*i'*. Both computes can be hardware accelerated with Kokkos by using the +*sna/grid/kk* and *sna/grid/local/kk* commands, respectively. Compute *sna/grid* calculates a global array containing bispectrum components for a regular grid of points. @@ -463,6 +471,12 @@ fluctuations in the resulting local atomic environment fingerprint. The detailed formalism is given in the paper by Lafourcade et al. :ref:`(Lafourcade) `. +---------- + + +.. include:: accel_styles.rst + + ---------- Output info @@ -654,7 +668,7 @@ of Angular Momentum, World Scientific, Singapore (1987). .. _Ellis2021: -**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, Phys Rev B, 104, 035120, (2021) +**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) `_ .. 
_Lafourcade2023_2: diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index f272af6dec..5e8ab356e6 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -3380,6 +3380,7 @@ Schilfgarde Schimansky Schiotz Schlitter +Schmerler Schmid Schnieders Schoen @@ -4042,6 +4043,7 @@ VMDARCH VMDHOME vn Voigt +Vogel volfactor Volkov Volpe diff --git a/examples/snap/README.md b/examples/snap/README.md index 305f920ae8..1df24acf1f 100644 --- a/examples/snap/README.md +++ b/examples/snap/README.md @@ -9,5 +9,11 @@ in.snap.Mo_Chen # SNAP linear Mo potential in.snap.compute # SNAP compute for training a linear model in.snap.compute.quadratic # SNAP compute for training a quadratic model in.snap.scale.Ni_Zuo_JCPA2020 # SNAP linear Ni potential with thermodynamic integration (fix adapt scale) +in.C_SNAP # SNAP carbon potential compute_snap_dgrad.py # SNAP compute with dgradflag (dBi/dRj) for training a non-linear model + +in.snap.grid # SNAP descriptors on a grid +in.snap.grid.triclinic # SNAP descriptors on a grid, triclinic +in.gaussian.grid # Gaussian descriptors on a grid + diff --git a/examples/snap/in.gaussian.grid b/examples/snap/in.gaussian.grid new file mode 100644 index 0000000000..48aeec1632 --- /dev/null +++ b/examples/snap/in.gaussian.grid @@ -0,0 +1,68 @@ +# Demonstrate calculation of Gaussian descriptors on a grid +# for a cell with two atoms of type 1 and type 2. +# The output in dump.glocal shows that for grid points +# sitting on an atom of type 1 or 2: +# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219 +# val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670 +# These values are extracted to the log file +# + +variable nrep index 1 +variable a index 3.316 +variable ngrid index 2 + +units metal +atom_modify map hash + +# generate the box and atom positions using a BCC lattice + +variable nx equal ${nrep} +variable ny equal ${nrep} +variable nz equal ${nrep} + +boundary p p p + +lattice custom $a & + a1 1 0 0 & + a2 0 1 0 & + a3 0 0 1 & + basis 0 0 0 & + basis 0.5 0.5 0.5 & + +region box block 0 ${nx} 0 ${ny} 0 ${nz} +create_box 2 box +create_atoms 1 box basis 1 1 basis 2 2 + +mass * 180.88 + +# define atom compute and grid compute + +variable rcutfac equal 4.67637 +variable radelem1 equal 0.5 +variable radelem2 equal 0.5 +variable sigmaelem1 equal 0.1355 +variable sigmaelem2 equal 0.2 +variable gaussian_options string & + "${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}" + +# build zero potential to force ghost atom creation + +pair_style zero ${rcutfac} +pair_coeff * * + +# define atom and grid computes + +compute mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} & + ${gaussian_options} + +# define output + +dump 1 all local 1000 dump.glocal c_mygridlocal[*] +dump 2 all custom 1000 dump.gatom id x y z +compute val1 all reduce max c_mygridlocal[7] inputs local +compute val2 all reduce max c_mygridlocal[8] inputs local +thermo_style custom step c_val1 c_val2 + +# run + +run 0 diff --git a/examples/snap/in.grid.snap b/examples/snap/in.snap.grid similarity index 100% rename from examples/snap/in.grid.snap rename to examples/snap/in.snap.grid diff --git a/examples/snap/in.grid.tri b/examples/snap/in.snap.grid.triclinic similarity index 99% rename from examples/snap/in.grid.tri rename to examples/snap/in.snap.grid.triclinic index 95a14f3bb4..59063f576e 100644 --- a/examples/snap/in.grid.tri +++ b/examples/snap/in.snap.grid.triclinic @@ -47,7 +47,6 @@ lattice custom $a & 
basis 0.0 0.0 0.5 & spacing 1 1 1 -box tilt large region box prism 0 ${nx} 0 ${ny} 0 ${nz} ${ny} ${nz} ${nz} create_box 1 box create_atoms 1 box diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.1 b/examples/snap/log.10Dec24.gaussian.grid.g++.1 new file mode 100644 index 0000000000..b158ac07d0 --- /dev/null +++ b/examples/snap/log.10Dec24.gaussian.grid.g++.1 @@ -0,0 +1,129 @@ +LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99) + using 1 OpenMP thread(s) per MPI task +# Demonstrate calculation of Gaussian descriptors on a grid +# for a cell with two atoms of type 1 and type 2. +# The output in dump.glocal shows that for grid points +# sitting on an atom of type 1 or 2: +# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219 +# val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670 +# These values are extracted to the log file +# + +variable nrep index 1 +variable a index 3.316 +variable ngrid index 2 + +units metal +atom_modify map hash + +# generate the box and atom positions using a BCC lattice + +variable nx equal ${nrep} +variable nx equal 1 +variable ny equal ${nrep} +variable ny equal 1 +variable nz equal ${nrep} +variable nz equal 1 + +boundary p p p + +lattice custom $a a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5 +lattice custom 3.316 a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5 +Lattice spacing in x,y,z = 3.316 3.316 3.316 +region box block 0 ${nx} 0 ${ny} 0 ${nz} +region box block 0 1 0 ${ny} 0 ${nz} +region box block 0 1 0 1 0 ${nz} +region box block 0 1 0 1 0 1 +create_box 2 box +Created orthogonal box = (0 0 0) to (3.316 3.316 3.316) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box basis 1 1 basis 2 2 +Created 2 atoms + using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316) + create_atoms CPU = 0.001 seconds + +mass * 180.88 + +# define atom compute and grid compute + +variable rcutfac equal 4.67637 +variable radelem1 equal 0.5 +variable radelem2 equal 0.5 +variable sigmaelem1 equal 0.1355 +variable sigmaelem2 equal 0.2 +variable gaussian_options string "${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}" +4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 0.5 0.1355 ${sigmaelem2} +4.67637 0.5 0.5 0.1355 0.2 + +# build zero potential to force ghost atom creation + +pair_style zero ${rcutfac} +pair_style zero 4.67637 +pair_coeff * * + +# define atom and grid computes + +compute mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 2 ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 2 4.67637 0.5 0.5 0.1355 0.2 + +# define output + +dump 1 all local 1000 dump.glocal c_mygridlocal[*] +dump 2 all custom 1000 dump.gatom id x y z +compute val1 all reduce max c_mygridlocal[7] inputs local +compute val2 all reduce max c_mygridlocal[8] inputs local +thermo_style custom step c_val1 c_val2 + +# run + +run 0 +WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60) +Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... 
+ update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 6.67637 + ghost atom cutoff = 6.67637 + binsize = 3.338185, bins = 1 1 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair zero, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes + Step c_val1 c_val2 + 0 25.521859 7.9367045 +Loop time of 1.088e-06 on 1 procs for 0 steps with 2 atoms + +183.8% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0 | 0 | 0 | 0.0 | 0.00 +Output | 0 | 0 | 0 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 1.088e-06 | | |100.00 + +Nlocal: 2 ave 2 max 2 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 339 ave 339 max 339 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 64 ave 64 max 64 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 64 +Ave neighs/atom = 32 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:00 diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.4 b/examples/snap/log.10Dec24.gaussian.grid.g++.4 new file mode 100644 index 0000000000..54cc842bc7 --- /dev/null +++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4 @@ -0,0 +1,130 @@ +LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99) + using 1 OpenMP thread(s) per MPI task +# Demonstrate calculation of Gaussian descriptors on a grid +# for a cell with two atoms of type 1 and type 2. 
+# The output in dump.glocal shows that for grid points +# sitting on an atom of type 1 or 2: +# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219 +# val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670 +# These values are extracted to the log file +# + +variable nrep index 1 +variable a index 3.316 +variable ngrid index 2 + +units metal +atom_modify map hash + +# generate the box and atom positions using a BCC lattice + +variable nx equal ${nrep} +variable nx equal 1 +variable ny equal ${nrep} +variable ny equal 1 +variable nz equal ${nrep} +variable nz equal 1 + +boundary p p p + +lattice custom $a a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5 +lattice custom 3.316 a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5 +Lattice spacing in x,y,z = 3.316 3.316 3.316 +region box block 0 ${nx} 0 ${ny} 0 ${nz} +region box block 0 1 0 ${ny} 0 ${nz} +region box block 0 1 0 1 0 ${nz} +region box block 0 1 0 1 0 1 +create_box 2 box +Created orthogonal box = (0 0 0) to (3.316 3.316 3.316) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box basis 1 1 basis 2 2 +Created 2 atoms + using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316) + create_atoms CPU = 0.001 seconds + +mass * 180.88 + +# define atom compute and grid compute + +variable rcutfac equal 4.67637 +variable radelem1 equal 0.5 +variable radelem2 equal 0.5 +variable sigmaelem1 equal 0.1355 +variable sigmaelem2 equal 0.2 +variable gaussian_options string "${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}" +4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 0.5 0.1355 ${sigmaelem2} +4.67637 0.5 0.5 0.1355 0.2 + +# build zero potential to force ghost atom creation + +pair_style zero ${rcutfac} +pair_style zero 4.67637 +pair_coeff * * + +# define atom and grid computes + +compute mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 2 ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 2 4.67637 0.5 0.5 0.1355 0.2 + +# define output + +dump 1 all local 1000 dump.glocal c_mygridlocal[*] +dump 2 all custom 1000 dump.gatom id x y z +compute val1 all reduce max c_mygridlocal[7] inputs local +compute val2 all reduce max c_mygridlocal[8] inputs local +thermo_style custom step c_val1 c_val2 + +# run + +run 0 +WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60) +Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... 
+ update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 6.67637 + ghost atom cutoff = 6.67637 + binsize = 3.338185, bins = 1 1 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair zero, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202) +Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes + Step c_val1 c_val2 + 0 25.521859 7.9367045 +Loop time of 2.238e-06 on 4 procs for 0 steps with 2 atoms + +89.4% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0 | 0 | 0 | 0.0 | 0.00 +Output | 0 | 0 | 0 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 2.238e-06 | | |100.00 + +Nlocal: 0.5 ave 1 max 0 min +Histogram: 2 0 0 0 0 0 0 0 0 2 +Nghost: 274.5 ave 275 max 274 min +Histogram: 2 0 0 0 0 0 0 0 0 2 +Neighs: 16 ave 40 max 0 min +Histogram: 2 0 0 0 0 0 1 0 0 1 + +Total # of neighbors = 64 +Ave neighs/atom = 32 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:00 diff --git a/src/.gitignore b/src/.gitignore index c1f6b6e892..45f7a9f1a0 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -252,6 +252,8 @@ /*rheo*.cpp /*rheo*.h +/compute_gaussian_grid_local.cpp +/compute_gaussian_grid_local.h /compute_grid.cpp /compute_grid.h /compute_grid_local.cpp diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index 4269e64189..d34d5eb9ee 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -119,6 +119,14 @@ action compute_composition_atom_kokkos.cpp compute_composition_atom.cpp action compute_composition_atom_kokkos.h compute_composition_atom.h action compute_orientorder_atom_kokkos.cpp action compute_orientorder_atom_kokkos.h +action compute_sna_grid_kokkos.cpp compute_sna_grid.cpp +action compute_sna_grid_kokkos.h compute_sna_grid.h +action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp +action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp +action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h +action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp +action compute_gaussian_grid_local_kokkos.cpp compute_gaussian_grid_local.cpp +action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local.h action compute_temp_deform_kokkos.cpp action compute_temp_deform_kokkos.h action compute_temp_kokkos.cpp diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp new file mode 100644 index 0000000000..cfd7e5a582 --- /dev/null +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -0,0 +1,327 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Drew Rohskopf (SNL) +------------------------------------------------------------------------- */ + +#include "compute_gaussian_grid_local_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "memory_kokkos.h" +#include "modify.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor_kokkos.h" +#include "pair.h" +#include "update.h" + +#include +#include + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +template +ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) : + ComputeGaussianGridLocal(lmp, narg, arg) +{ + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + + k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1); + auto d_cutsq = k_cutsq.template view(); + rnd_cutsq = d_cutsq; + + host_flag = (execution_space == Host); + + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = 1; j <= atom->ntypes; j++){ + k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j]; //cutsq_tmp; + k_cutsq.template modify(); + } + } + // Set up element lists + int n = atom->ntypes; + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",n); + MemKK::realloc_kokkos(d_sigmaelem,"ComputeSNAGridKokkos::sigmaelem",n+1); + MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1); + MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1); + MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); + auto h_radelem = Kokkos::create_mirror_view(d_radelem); + auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem); + auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem); + auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem); + auto h_map = Kokkos::create_mirror_view(d_map); + // start from index 1 because of how compute sna/grid is + for (int i = 1; i <= atom->ntypes; i++) { + h_radelem(i-1) = radelem[i]; + h_sigmaelem(i-1) = sigmaelem[i]; + h_prefacelem(i-1) = prefacelem[i]; + h_argfacelem(i-1) = argfacelem[i]; + } + Kokkos::deep_copy(d_radelem,h_radelem); + Kokkos::deep_copy(d_sigmaelem,h_sigmaelem); + Kokkos::deep_copy(d_prefacelem, h_prefacelem); + Kokkos::deep_copy(d_argfacelem, h_argfacelem); + Kokkos::deep_copy(d_map,h_map); + +} + +/* ---------------------------------------------------------------------- */ + +template +ComputeGaussianGridLocalKokkos::~ComputeGaussianGridLocalKokkos() +{ + if (copymode) return; + + memoryKK->destroy_kokkos(k_cutsq,cutsq); + memoryKK->destroy_kokkos(k_alocal,alocal); + //gridlocal_allocated = 0; + +} + +/* ---------------------------------------------------------------------- */ + +template +void ComputeGaussianGridLocalKokkos::setup() +{ + + ComputeGridLocal::setup(); + + // allocate arrays + memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal"); + array_local = alocal; + d_alocal = k_alocal.template view(); + +} + +/* ---------------------------------------------------------------------- */ + +template +void ComputeGaussianGridLocalKokkos::init() +{ + ComputeGaussianGridLocal::init(); +} + +/* 
---------------------------------------------------------------------- */ + +template +void ComputeGaussianGridLocalKokkos::compute_local() +{ + if (host_flag) { + return; + } + + invoked_local = update->ntimestep; + + copymode = 1; + + zlen = nzhi-nzlo+1; + ylen = nyhi-nylo+1; + xlen = nxhi-nxlo+1; + total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1); + + atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); + x = atomKK->k_x.view(); + type = atomKK->k_type.view(); + k_cutsq.template sync(); + + // max_neighs is defined here - think of more elaborate methods. + max_neighs = 100; + + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // number of atoms. + ntotal = atomKK->nlocal + atomKK->nghost; + // Allocate view for number of neighbors per grid point + MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range); + + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + // `total_range` is the number of grid points which may be larger than chunk size. + // printf(">>> total_range: %d\n", total_range); + chunksize = 32768; // 100*32768 + chunk_size = MIN(chunksize, total_range); + chunk_offset = 0; + + int vector_length_default = 1; + int team_size_default = 1; + if (!host_flag) + team_size_default = 1; // cost will increase with increasing team size //32;//max_neighs; + + if (triclinic){ + h0 = domain->h[0]; + h1 = domain->h[1]; + h2 = domain->h[2]; + h3 = domain->h[3]; + h4 = domain->h[4]; + h5 = domain->h[5]; + lo0 = domain->boxlo[0]; + lo1 = domain->boxlo[1]; + lo2 = domain->boxlo[2]; + } + + while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory + + if (chunk_size > total_range - chunk_offset) + chunk_size = total_range - chunk_offset; + + //Neigh + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this); + } + + // Proceed to the next chunk. + chunk_offset += chunk_size; + } // end while + + copymode = 0; + + k_alocal.template modify(); + k_alocal.template sync(); + +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy::member_type& team) const +{ + const int ii = team.league_rank(); + + if (ii >= chunk_size) return; + + // extract grid index + int igrid = ii + chunk_offset; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff. + // If it is, type_cache is assigned to the atom type. + // If it's not, it's assigned to -1. 
+ const int tile_size = ntotal; //max_neighs; // number of elements per thread + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + + // convert to grid indices + + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + + // index ii already captures the proper grid point + //int igrid = iz * (nx * ny) + iy * nx + ix; + + // grid2x converts igrid to ix,iy,iz like we've done before + // multiply grid integers by grid spacing delx, dely, delz + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. + + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + + const F_FLOAT xtmp = xgrid[0]; + const F_FLOAT ytmp = xgrid[1]; + const F_FLOAT ztmp = xgrid[2]; + + // Zeroing out the components, which are filled as a sum. + for (int icol = size_local_cols_base; icol < size_local_cols; icol++){ + d_alocal(igrid, icol) = 0.0; + } + + // Fill grid info columns + d_alocal(igrid, 0) = ix; + d_alocal(igrid, 1) = iy; + d_alocal(igrid, 2) = iz; + d_alocal(igrid, 3) = xtmp; + d_alocal(igrid, 4) = ytmp; + d_alocal(igrid, 5) = ztmp; + + // currently, all grid points are type 1 + // not clear what a better choice would be + const int itype = 1; + int ielem = 0; + ielem = d_map[itype]; + const double radi = d_radelem[ielem]; + + // Compute the number of neighbors, store rsq + int ninside = 0; + + // Looping over ntotal for now. 
+ for (int j = 0; j < ntotal; j++){ + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + if (rsq < rnd_cutsq(jtype, jtype) ) { + int icol = size_local_cols_base + jtype - 1; + d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1)); + } + } +} + +/* ---------------------------------------------------------------------- + check max team size +------------------------------------------------------------------------- */ + +template +template +void ComputeGaussianGridLocalKokkos::check_team_size_for(int inum, int &team_size, int vector_length) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +namespace LAMMPS_NS { +template class ComputeGaussianGridLocalKokkos; +#ifdef LMP_KOKKOS_GPU +template class ComputeGaussianGridLocalKokkos; +#endif +} diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h new file mode 100644 index 0000000000..34e12bc4af --- /dev/null +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h @@ -0,0 +1,96 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(gaussian/grid/local/kk,ComputeGaussianGridLocalKokkos); +ComputeStyle(gaussian/grid/local/kk/device,ComputeGaussianGridLocalKokkos); +ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos); +// clang-format on + +#else + +#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H +#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H + +#include "compute_gaussian_grid_local.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +// clang-format off +struct TagComputeGaussianGridLocalNeigh{}; +// clang-format on + +template class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + // Static team/tile sizes for device offload + +#ifdef KOKKOS_ENABLE_HIP + static constexpr int team_size_compute_neigh = 2; +#else + static constexpr int team_size_compute_neigh = 4; +#endif + + ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **); + ~ComputeGaussianGridLocalKokkos() override; + void setup() override; + void init() override; + void compute_local() override; + + template + void check_team_size_for(int, int&, int); + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy::member_type& team) const; + + private: + Kokkos::View d_radelem; // element radii + Kokkos::View d_sigmaelem; + Kokkos::View d_prefacelem; + Kokkos::View d_argfacelem; + Kokkos::View d_ninside; // ninside for all atoms in list + Kokkos::View d_map; // mapping from atom types to elements + + typedef Kokkos::DualView tdual_fparams; + tdual_fparams k_cutsq; + typedef Kokkos::View > t_fparams_rnd; + t_fparams_rnd rnd_cutsq; + + + int max_neighs, inum, chunk_size, chunk_offset; + int host_flag; + int total_range; // total number of loop iterations in grid + int xlen, ylen, zlen; + int chunksize; + int ntotal; + + typename AT::t_x_array_randomread x; + typename AT::t_int_1d_randomread type; + + DAT::tdual_float_2d k_alocal; + typename AT::t_float_2d d_alocal; + + // triclinic vars + double h0, h1, h2, h3, h4, h5; + double lo0, lo1, lo2; +}; + +} // namespace LAMMPS_NS + +#endif +#endif diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp new file mode 100644 index 0000000000..197234cf1d --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp @@ -0,0 +1,25 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "compute_sna_grid_kokkos.h" +#include "compute_sna_grid_kokkos_impl.h" + +namespace LAMMPS_NS { + +template class ComputeSNAGridKokkosDevice; +#ifdef LMP_KOKKOS_GPU +template class ComputeSNAGridKokkosHost; +#endif + +} diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h new file mode 100644 index 0000000000..8a7d87acbb --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -0,0 +1,297 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice); +ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice); +#ifdef LMP_KOKKOS_GPU +ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost); +#else +ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice); +#endif +// clang-format on +#else + +// clang-format off +#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H +#define LMP_COMPUTE_SNA_GRID_KOKKOS_H + +#include "compute_sna_grid.h" +#include "kokkos_type.h" +#include "sna_kokkos.h" + +namespace LAMMPS_NS { + +// Routines for both the CPU and GPU backend + +// GPU backend only +struct TagCSNAGridComputeNeigh{}; +struct TagCSNAGridComputeCayleyKlein{}; +struct TagCSNAGridPreUi{}; +struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence +struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence +struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +template struct TagCSNAGridComputeZi{}; +template struct TagCSNAGridComputeBi{}; +struct TagCSNAGridLocalFill{}; // fill the gridlocal array + +struct TagComputeSNAGridLoop{}; +struct TagComputeSNAGrid3D{}; + +// CPU backend only +struct TagComputeSNAGridLoopCPU{}; + +//template +template +class ComputeSNAGridKokkos : public ComputeSNAGrid { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + static constexpr int vector_length = vector_length_; + using real_type = real_type_; + using complex = SNAComplex; + + // Static team/tile sizes for device offload + +#ifdef KOKKOS_ENABLE_HIP + static constexpr int team_size_compute_neigh = 2; + static constexpr int tile_size_compute_ck = 2; + static constexpr int tile_size_pre_ui = 2; + static constexpr int team_size_compute_ui = 2; + static constexpr int tile_size_transform_ui = 2; + static constexpr int tile_size_compute_zi = 2; + static constexpr int min_blocks_compute_zi = 0; // no minimum bound + static constexpr int tile_size_compute_bi = 2; + static constexpr int tile_size_compute_yi = 2; + static constexpr int min_blocks_compute_yi = 0; // no minimum bound + static constexpr int team_size_compute_fused_deidrj = 2; +#else + static constexpr int team_size_compute_neigh = 4; + static constexpr int tile_size_compute_ck = 4; + static constexpr int tile_size_pre_ui = 4; + static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 
8 : 4; + static constexpr int tile_size_transform_ui = 4; + static constexpr int tile_size_compute_zi = 8; + static constexpr int tile_size_compute_bi = 4; + static constexpr int tile_size_compute_yi = 8; + static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2; + + // this empirically reduces perf fluctuations from compiler version to compiler version + static constexpr int min_blocks_compute_zi = 4; + static constexpr int min_blocks_compute_yi = 4; +#endif + + // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches + // This hides the Kokkos::IndexType and Kokkos::Rank<3...> + // and reduces the verbosity of the LaunchBound by hiding the explicit + // multiplication by vector_length + template + using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagComputeSNA>; + + // MDRangePolicy for the 3D grid loop: + template + using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>; + + // Testing out team policies + template + using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNA>; + //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, Kokkos::IndexType, Kokkos::IndexType, TagComputeSNA>; + //using team_member = typename team_policy::member_type; + + // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches + // This hides the LaunchBounds abstraction by hiding the explicit + // multiplication by vector length + template + using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNA>; + + // Helper routine that returns a CPU or a GPU policy as appropriate + template + auto snap_get_policy(const int& chunk_size_div, const int& second_loop) { + return Snap3DRangePolicy({0, 0, 0}, + {vector_length, second_loop, chunk_size_div}, + {vector_length, num_tiles, 1}); + } + + ComputeSNAGridKokkos(class LAMMPS *, int, char **); + ~ComputeSNAGridKokkos() override; + + void setup() override; + void compute_array() override; + + // Utility functions for teams + + template + void check_team_size_for(int, int&); + + template + void check_team_size_reduce(int, int&); + + // operator function for example team policy + //KOKKOS_INLINE_FUNCTION + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLoop, const int& ) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLoopCPU, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + + // 3D case - used by parallel_for + KOKKOS_INLINE_FUNCTION + void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridPreUi, const int& iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() 
(TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridTransformUi, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeZi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeZi, const int& iatom, const int& idxz) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeZi, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeBi, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeBi, const int& iatom, const int& idxb) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeBi, const int& iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalFill,const int& ii) const; + + protected: + + SNAKokkos snaKK; + + int max_neighs, chunk_size, chunk_offset; + int host_flag; + int ntotal; + int total_range; // total number of loop iterations in grid + int zlen; //= nzhi-nzlo+1; + int ylen; //= nyhi-nylo+1; + int xlen; //= nxhi-nxlo+1; + + double cutsq_tmp; // temporary cutsq until we get a view + + Kokkos::View d_radelem; // element radii + Kokkos::View d_wjelem; // elements weights + Kokkos::View d_coeffelem; // element bispectrum coefficients + Kokkos::View d_sinnerelem; // element inner cutoff midpoint + Kokkos::View d_dinnerelem; // element inner cutoff half-width + Kokkos::View d_ninside; // ninside for all atoms in list + Kokkos::View d_map; // mapping from atom types to elements + Kokkos::View d_test; // test view + + typedef Kokkos::DualView tdual_fparams; + tdual_fparams k_cutsq; + typedef Kokkos::View > t_fparams_rnd; + t_fparams_rnd rnd_cutsq; + + typename AT::t_x_array_randomread x; + typename AT::t_int_1d_randomread type; + DAT::tdual_float_2d k_grid; + DAT::tdual_float_2d k_gridall; + typename AT::t_float_2d d_grid; + typename AT::t_float_2d d_gridall; + + DAT::tdual_float_4d k_gridlocal; + typename AT::t_float_4d d_gridlocal; + + + // Utility routine which wraps computing per-team scratch size requirements for + // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj + template + int scratch_size_helper(int values_per_team); + + class DomainKokkos *domainKK; + + // triclinic vars + double h0, h1, h2, h3, h4, h5; + double lo0, lo1, lo2; + + // Make SNAKokkos a friend + friend class SNAKokkos; +}; + +// These wrapper classes exist to make the compute style factory happy/avoid having +// to extend the compute style factory to support Compute classes w/an arbitrary number +// of extra template parameters + +template +class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos { + + private: + using Base = ComputeSNAGridKokkos; + + public: + + ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **); + + void compute_array() override; + +}; + +#ifdef LMP_KOKKOS_GPU +template +class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos { + + private: + using Base = ComputeSNAGridKokkos; + + public: + + ComputeSNAGridKokkosHost(class LAMMPS *, int, char **); + + void compute_array() override; + +}; +#endif + +} + +#endif +#endif diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h new file mode 100644 index 
0000000000..665a1b67e7 --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -0,0 +1,786 @@ +// clang-format off +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Christian Trott (SNL), Stan Moore (SNL), + Evan Weinberg (NVIDIA) +------------------------------------------------------------------------- */ + +#include "compute_sna_grid_kokkos.h" +#include "pair_snap_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "memory_kokkos.h" +#include "modify.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor_kokkos.h" +#include "domain.h" +#include "domain_kokkos.h" +#include "sna.h" +#include "update.h" + +#include +#include +#include + +#include + +#define MAXLINE 1024 +#define MAXWORD 3 + +namespace LAMMPS_NS { + +// Constructor + +template +ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg) +{ + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + domainKK = (DomainKokkos *) domain; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + + k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1); + auto d_cutsq = k_cutsq.template view(); + rnd_cutsq = d_cutsq; + + host_flag = (execution_space == Host); + + // TODO: Extract cutsq in double loop below, no need for cutsq_tmp + + cutsq_tmp = cutsq[1][1]; + + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = 1; j <= atom->ntypes; j++){ + k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp; + k_cutsq.template modify(); + } + } + + // Set up element lists + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); + MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements); + MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements); + MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements); + // test + MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements); + + int n = atom->ntypes; + MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); + + auto h_radelem = Kokkos::create_mirror_view(d_radelem); + auto h_wjelem = Kokkos::create_mirror_view(d_wjelem); + auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem); + auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem); + auto h_map = Kokkos::create_mirror_view(d_map); + // test + auto h_test = Kokkos::create_mirror_view(d_test); + h_test(0) = 2.0; + + // start from index 1 because of how compute sna/grid is + for (int i = 1; i <= atom->ntypes; i++) { + h_radelem(i-1) = radelem[i]; + h_wjelem(i-1) = wjelem[i]; + if (switchinnerflag){ + h_sinnerelem(i) = sinnerelem[i]; + h_dinnerelem(i) = dinnerelem[i]; + } + } + + // In pair snap some things like `map` get allocated regardless of 
chem flag. + if (chemflag){ + for (int i = 1; i <= atom->ntypes; i++) { + h_map(i) = map[i]; + } + } + + Kokkos::deep_copy(d_radelem,h_radelem); + Kokkos::deep_copy(d_wjelem,h_wjelem); + if (switchinnerflag){ + Kokkos::deep_copy(d_sinnerelem,h_sinnerelem); + Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); + } + if (chemflag){ + Kokkos::deep_copy(d_map,h_map); + } + Kokkos::deep_copy(d_test,h_test); + + snaKK = SNAKokkos(*this); + snaKK.grow_rij(0,0); + snaKK.init(); +} + +// Destructor + +template +ComputeSNAGridKokkos::~ComputeSNAGridKokkos() +{ + if (copymode) return; + + memoryKK->destroy_kokkos(k_cutsq,cutsq); + memoryKK->destroy_kokkos(k_gridall, gridall); +} + +// Setup + +template +void ComputeSNAGridKokkos::setup() +{ + // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there. + // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices. + + ComputeGrid::set_grid_global(); + ComputeGrid::set_grid_local(); + + // allocate arrays + memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); + + // do not use or allocate gridlocal for now + + gridlocal_allocated = 0; + array = gridall; + + d_gridlocal = k_gridlocal.template view(); + d_gridall = k_gridall.template view(); +} + +// Compute + +template +void ComputeSNAGridKokkos::compute_array() +{ + if (host_flag) { + ComputeSNAGrid::compute_array(); + return; + } + + copymode = 1; + + zlen = nzhi-nzlo+1; + ylen = nyhi-nylo+1; + xlen = nxhi-nxlo+1; + total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1); + + atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); + x = atomKK->k_x.view(); + type = atomKK->k_type.view(); + k_cutsq.template sync(); + + // max_neighs is defined here - think of more elaborate methods. + max_neighs = 100; + + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // number of atoms. + + ntotal = atomKK->nlocal + atomKK->nghost; + // Allocate view for number of neighbors per grid point + MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range); + + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + // `total_range` is the number of grid points which may be larger than chunk size. 
+ chunk_size = MIN(chunksize, total_range); + chunk_offset = 0; + snaKK.grow_rij(chunk_size, max_neighs); + + // Pre-compute ceil(chunk_size / vector_length) for code cleanliness + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + + if (triclinic) { + h0 = domain->h[0]; + h1 = domain->h[1]; + h2 = domain->h[2]; + h3 = domain->h[3]; + h4 = domain->h[4]; + h5 = domain->h[5]; + lo0 = domain->boxlo[0]; + lo1 = domain->boxlo[1]; + lo2 = domain->boxlo[2]; + } + + while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory + + if (chunk_size > total_range - chunk_offset) + chunk_size = total_range - chunk_offset; + + + //ComputeNeigh + { + int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); //ntotal); + + SnapAoSoATeamPolicy + policy_neigh(chunk_size, team_size_compute_neigh, vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + //ComputeCayleyKlein + { + // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h` + Snap3DRangePolicy + policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1}); + Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); + } + + //PreUi + { + auto policy_pre_ui = snap_get_policy(chunk_size_div, twojmax + 1); + Kokkos::parallel_for("PreUi", policy_pre_ui, *this); + } + + // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot + { + // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h` + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + + if (chunk_size < parallel_thresh) + { + // Version with parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this); + } else { + // Version w/out parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this); + } + } + + //TransformUi: un-"fold" ulisttot, zero ylist + { + // Expand ulisttot_re,_im -> ulisttot + // Zero out ylist + auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); + Kokkos::parallel_for("TransformUi", policy_transform_ui, *this); + } + + //Compute bispectrum + // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` + + //ComputeZi and Bi + if (nelements > 1) { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this); + + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + 
Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this); + } else { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this); + + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this); + } + + // Fill the grid array with bispectrum values + { + typename Kokkos::RangePolicy policy_fill(0,chunk_size); + Kokkos::parallel_for(policy_fill, *this); + } + + // Proceed to the next chunk. + chunk_offset += chunk_size; + + } // end while + + copymode = 0; + + k_gridlocal.template modify(); + k_gridlocal.template sync(); + + k_gridall.template modify(); + k_gridall.template sync(); +} + +/* ---------------------------------------------------------------------- + Begin routines that are unique to the GPU codepath. These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials +------------------------------------------------------------------------- */ + +/* + Simple team policy functor seeing how many layers deep we can go with the parallelism. + */ +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos. + // Main difference is that we don't use the neighbor class or neighbor variables here. + // This is because the grid points are not atoms and therefore do not get assigned + // neighbors in LAMMPS. + // TODO: If we did make a neighborlist for each grid point, we could use current + // routines and avoid having to loop over all atoms (which limits us to + // natoms = max team size). + + // basic quantities associated with this team: + // team_rank : rank of thread in this team + // league_rank : rank of team in this league + // team_size : number of threads in this team + + // extract loop index + int ii = team.team_rank() + team.league_rank() * team.team_size(); + + if (ii >= chunk_size) return; + + // extract grid index + int igrid = ii + chunk_offset; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff. + // If it is, type_cache is assigned to the atom type. + // If it's not, it's assigned to -1. + //const int tile_size = ntotal; //max_neighs; // number of elements per thread + //const int team_rank = team.team_rank(); + //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + + // convert to grid indices + + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + + // index ii already captures the proper grid point + //int igrid = iz * (nx * ny) + iy * nx + ix; + + // grid2x converts igrid to ix,iy,iz like we've done before + // multiply grid integers by grid spacing delx, dely, delz + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. 
+ + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + + const F_FLOAT xtmp = xgrid[0]; + const F_FLOAT ytmp = xgrid[1]; + const F_FLOAT ztmp = xgrid[2]; + + // currently, all grid points are type 1 + // not clear what a better choice would be + + const int itype = 1; + int ielem = 0; + if (chemflag) ielem = d_map[itype]; + //const double radi = d_radelem[ielem]; + + // Compute the number of neighbors, store rsq + int ninside = 0; + + // Looping over ntotal for now. + for (int j = 0; j < ntotal; j++){ + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + // don't include atoms that share location with grid point + if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { + jtype = -1; // use -1 to signal it's outside the radius + } + + if (jtype >= 0) + ninside++; + } + + d_ninside(ii) = ninside; + + // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. + int offset = 0; + for (int j = 0; j < ntotal; j++){ + //const int jtype = type_cache[j]; + //if (jtype >= 0) { + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + int jtype = type(j); + if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) { + int jelem = 0; + if (chemflag) jelem = d_map[jtype]; + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); + // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp + // actually since the views here have values starting at 0, let's use jelem + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; + if (switchinnerflag) { + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + } + if (chemflag) + snaKK.element(ii,offset) = jelem; + else + snaKK.element(ii,offset) = 0; + offset++; + } + } +} + +/* ---------------------------------------------------------------------- + Pre-compute the Cayley-Klein parameters for reuse in later routines +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_cayley_klein(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + Initialize the "ulisttot" structure with non-zero on-diagonal terms + and zero terms elsewhere +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const { + 
const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + int itype = type(iatom); + int ielem = d_map[itype]; + + snaKK.pre_ui(iatom, j, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const { + if (iatom >= chunk_size) return; + + int itype = type(iatom); + int ielem = d_map[itype]; + + snaKK.pre_ui(iatom, j, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int itype = type(iatom); + const int ielem = d_map[itype]; + + for (int j = 0; j <= twojmax; j++) + snaKK.pre_ui(iatom, j, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - iatom_div * max_neighs; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); + }); +} + +/* ---------------------------------------------------------------------- + De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot + structure. Zero-initialize ylist. CPU and GPU. 
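+   Note: in this grid compute the "iatom" index runs over the grid points of
+   the current chunk rather than over atoms; apart from that the kernels
+   closely follow their pair_snap_kokkos counterparts.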
+------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (idxu >= snaKK.idxu_max) return; + snaKK.transform_ui(iatom, idxu); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const { + if (iatom >= chunk_size) return; + snaKK.transform_ui(iatom, idxu); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int idxu = 0; idxu < snaKK.idxu_max; idxu++) + snaKK.transform_ui(iatom, idxu); +} + +/* ---------------------------------------------------------------------- + Compute all elements of the Z tensor and store them into the `zlist` + view +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + snaKK.template compute_zi(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi, const int& iatom, const int& jjz) const { + if (iatom >= chunk_size) return; + snaKK.template compute_zi(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.template compute_zi(iatom, jjz); +} + +/* ---------------------------------------------------------------------- + Compute the energy triple products and store in the "blist" view +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi, const int& iatom_mod, const int& jjb, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjb >= snaKK.idxb_max) return; + snaKK.template compute_bi(iatom, jjb); +} + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi, const int& iatom, const int& jjb) const { + if (iatom >= chunk_size) return; + snaKK.template compute_bi(iatom, jjb); +} + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) + snaKK.template compute_bi(iatom, jjb); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridLocalFill, const int& ii) const { + + // extract grid index + int igrid = ii + chunk_offset; + + // convert to grid indices + + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + + // index ii already captures the proper grid point + // int igrid = iz * (nx * ny) + iy * nx + ix; + // printf("ii igrid: %d %d\n", ii, igrid); + + // 
grid2x converts igrid to ix,iy,iz like we've done before + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. + + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + + const F_FLOAT xtmp = xgrid[0]; + const F_FLOAT ytmp = xgrid[1]; + const F_FLOAT ztmp = xgrid[2]; + d_gridall(igrid,0) = xtmp; + d_gridall(igrid,1) = ytmp; + d_gridall(igrid,2) = ztmp; + + const auto idxb_max = snaKK.idxb_max; + + // linear contributions + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + d_gridall(igrid,icoeff+3) = snaKK.blist(ii,idx_chem,idxb); + } + +} + +/* ---------------------------------------------------------------------- + utility functions +------------------------------------------------------------------------- */ + +template +template +void ComputeSNAGridKokkos::check_team_size_for(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +void ComputeSNAGridKokkos::check_team_size_reduce(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +int ComputeSNAGridKokkos::scratch_size_helper(int values_per_team) { + typedef Kokkos::View > ScratchViewType; + + return ScratchViewType::shmem_size(values_per_team); +} + +/* ---------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + routines used by template reference classes +------------------------------------------------------------------------- */ + + +template +ComputeSNAGridKokkosDevice::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg) + : ComputeSNAGridKokkos(lmp, narg, arg) { ; } + +template +void ComputeSNAGridKokkosDevice::compute_array() +{ + Base::compute_array(); +} + +#ifdef LMP_KOKKOS_GPU +template +ComputeSNAGridKokkosHost::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg) + : ComputeSNAGridKokkos(lmp, narg, arg) { ; } + +template +void ComputeSNAGridKokkosHost::compute_array() +{ + Base::compute_array(); +} +#endif + +} diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp new file mode 100644 index 0000000000..3835a56bf8 --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp @@ -0,0 +1,25 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "compute_sna_grid_local_kokkos.h" +#include "compute_sna_grid_local_kokkos_impl.h" + +namespace LAMMPS_NS { + +template class ComputeSNAGridLocalKokkosDevice; +#ifdef LMP_KOKKOS_GPU +template class ComputeSNAGridLocalKokkosHost; +#endif + +} diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h new file mode 100644 index 0000000000..2ffc050b2d --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h @@ -0,0 +1,288 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(sna/grid/local/kk,ComputeSNAGridLocalKokkosDevice); +ComputeStyle(sna/grid/local/kk/device,ComputeSNAGridLocalKokkosDevice); +#ifdef LMP_KOKKOS_GPU +ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosHost); +#else +ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice); +#endif +// clang-format on +#else + +// clang-format off +#ifndef LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H +#define LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H + +#include "compute_sna_grid_local.h" +#include "kokkos_type.h" +#include "sna_kokkos.h" + +namespace LAMMPS_NS { + +// Routines for both the CPU and GPU backend + +// GPU backend only +struct TagCSNAGridLocalComputeNeigh{}; +struct TagCSNAGridLocalComputeCayleyKlein{}; +struct TagCSNAGridLocalPreUi{}; +struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence +struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence +struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +template struct TagCSNAGridLocalComputeZi{}; +template struct TagCSNAGridLocalComputeBi{}; +struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array + +struct TagComputeSNAGridLocalLoop{}; +struct TagComputeSNAGridLocal3D{}; + +// CPU backend only +struct TagComputeSNAGridLocalLoopCPU{}; + +//template +template +class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + static constexpr int vector_length = vector_length_; + using real_type = real_type_; + using complex = SNAComplex; + + // Static team/tile sizes for device offload + +#ifdef KOKKOS_ENABLE_HIP + static constexpr int team_size_compute_neigh = 2; + static constexpr int tile_size_compute_ck = 2; + static constexpr int tile_size_pre_ui = 2; + static constexpr int team_size_compute_ui = 2; + static constexpr int tile_size_transform_ui = 2; + static constexpr int tile_size_compute_zi = 2; + static constexpr int min_blocks_compute_zi = 0; // no minimum bound + static constexpr int 
tile_size_compute_bi = 2; + static constexpr int tile_size_compute_yi = 2; + static constexpr int min_blocks_compute_yi = 0; // no minimum bound + static constexpr int team_size_compute_fused_deidrj = 2; +#else + static constexpr int team_size_compute_neigh = 4; + static constexpr int tile_size_compute_ck = 4; + static constexpr int tile_size_pre_ui = 4; + static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4; + static constexpr int tile_size_transform_ui = 4; + static constexpr int tile_size_compute_zi = 8; + static constexpr int tile_size_compute_bi = 4; + static constexpr int tile_size_compute_yi = 8; + static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2; + + // this empirically reduces perf fluctuations from compiler version to compiler version + static constexpr int min_blocks_compute_zi = 4; + static constexpr int min_blocks_compute_yi = 4; +#endif + + // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches + // This hides the Kokkos::IndexType and Kokkos::Rank<3...> + // and reduces the verbosity of the LaunchBound by hiding the explicit + // multiplication by vector_length + template + using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagComputeSNA>; + + // MDRangePolicy for the 3D grid loop: + template + using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>; + + // Testing out team policies + template + using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNA>; + + // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches + // This hides the LaunchBounds abstraction by hiding the explicit + // multiplication by vector length + template + using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNA>; + + // Helper routine that returns a CPU or a GPU policy as appropriate + template + auto snap_get_policy(const int& chunk_size_div, const int& second_loop) { + return Snap3DRangePolicy({0, 0, 0}, + {vector_length, second_loop, chunk_size_div}, + {vector_length, num_tiles, 1}); + } + + ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **); + ~ComputeSNAGridLocalKokkos() override; + + void setup() override; + void compute_local() override; + + // Utility functions for teams + + template + void check_team_size_for(int, int&); + + template + void check_team_size_reduce(int, int&); + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLocalLoop, const int& ) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLocalLoopCPU, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + + // 3D case - used by parallel_for + KOKKOS_INLINE_FUNCTION + void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalPreUi, const int& iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeUiSmall,const 
typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalTransformUi, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeZi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeZi, const int& iatom, const int& idxz) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeZi, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeBi, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeBi, const int& iatom, const int& idxb) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeBi, const int& iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocal2Fill,const int& ii) const; + + protected: + + SNAKokkos snaKK; + + int max_neighs, chunk_size, chunk_offset; + int host_flag; + int ntotal; + int total_range; // total number of loop iterations in grid + int zlen; //= nzhi-nzlo+1; + int ylen; //= nyhi-nylo+1; + int xlen; //= nxhi-nxlo+1; + + double cutsq_tmp; // temporary cutsq until we get a view + + Kokkos::View d_radelem; // element radii + Kokkos::View d_wjelem; // elements weights + Kokkos::View d_coeffelem; // element bispectrum coefficients + Kokkos::View d_sinnerelem; // element inner cutoff midpoint + Kokkos::View d_dinnerelem; // element inner cutoff half-width + Kokkos::View d_ninside; // ninside for all atoms in list + Kokkos::View d_map; // mapping from atom types to elements + Kokkos::View d_test; // test view + + typedef Kokkos::DualView tdual_fparams; + tdual_fparams k_cutsq; + typedef Kokkos::View > t_fparams_rnd; + t_fparams_rnd rnd_cutsq; + + typename AT::t_x_array_randomread x; + typename AT::t_int_1d_randomread type; + + DAT::tdual_float_2d k_alocal; + typename AT::t_float_2d d_alocal; + + + // Utility routine which wraps computing per-team scratch size requirements for + // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj + template + int scratch_size_helper(int values_per_team); + + class DomainKokkos *domainKK; + + // triclinic vars + double h0, h1, h2, h3, h4, h5; + double lo0, lo1, lo2; + + // Make SNAKokkos a friend + friend class SNAKokkos; +}; + +// These wrapper classes exist to make the compute style factory happy/avoid having +// to extend the compute style factory to support Compute classes w/an arbitrary number +// of extra template parameters + +template +class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos { + + private: + using Base = ComputeSNAGridLocalKokkos; + + public: + + ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **); + + void compute_local() override; + +}; + +#ifdef LMP_KOKKOS_GPU +template +class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos { + + private: + using Base = ComputeSNAGridLocalKokkos; + + public: + + ComputeSNAGridLocalKokkosHost(class LAMMPS *, int, char **); + + void 
compute_local() override; + +}; +#endif + +} + +#endif +#endif diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h new file mode 100644 index 0000000000..01bb2b427b --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h @@ -0,0 +1,783 @@ +// clang-format off +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Andrew Rohskopf (SNL) +------------------------------------------------------------------------- */ + +#include "compute_sna_grid_local_kokkos.h" +#include "pair_snap_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "memory_kokkos.h" +#include "modify.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor_kokkos.h" +#include "domain.h" +#include "domain_kokkos.h" +#include "sna.h" +#include "update.h" + +#include +#include +#include + +#include + +#define MAXLINE 1024 +#define MAXWORD 3 + +namespace LAMMPS_NS { + +// Constructor + +template +ComputeSNAGridLocalKokkos::ComputeSNAGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocal(lmp, narg, arg) +{ + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + domainKK = (DomainKokkos *) domain; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + + k_cutsq = tdual_fparams("ComputeSNAGridLocalKokkos::cutsq",atom->ntypes+1,atom->ntypes+1); + auto d_cutsq = k_cutsq.template view(); + rnd_cutsq = d_cutsq; + + host_flag = (execution_space == Host); + + // TODO: Extract cutsq in double loop below, no need for cutsq_tmp + + cutsq_tmp = cutsq[1][1]; + + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = 1; j <= atom->ntypes; j++){ + k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp; + k_cutsq.template modify(); + } + } + + // Set up element lists + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridLocalKokkos::radelem",nelements); + MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridLocalKokkos:wjelem",nelements); + MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridLocalKokkos:sinnerelem",nelements); + MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridLocalKokkos:dinnerelem",nelements); + // test + MemKK::realloc_kokkos(d_test, "ComputeSNAGridLocalKokkos::test", nelements); + + int n = atom->ntypes; + MemKK::realloc_kokkos(d_map,"ComputeSNAGridLocalKokkos::map",n+1); + + auto h_radelem = Kokkos::create_mirror_view(d_radelem); + auto h_wjelem = Kokkos::create_mirror_view(d_wjelem); + auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem); + auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem); + auto h_map = Kokkos::create_mirror_view(d_map); + // test + auto h_test = Kokkos::create_mirror_view(d_test); + h_test(0) = 2.0; + + // start from index 1 because of how compute sna/grid is + for (int i = 1; i <= atom->ntypes; i++) { + 
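+    // LAMMPS types are 1-based while the per-element device views are 0-based,
+    // hence the i-1 shift when filling h_radelem and h_wjelem.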
h_radelem(i-1) = radelem[i]; + h_wjelem(i-1) = wjelem[i]; + if (switchinnerflag){ + h_sinnerelem(i) = sinnerelem[i]; + h_dinnerelem(i) = dinnerelem[i]; + } + } + + // In pair snap some things like `map` get allocated regardless of chem flag. + if (chemflag){ + for (int i = 1; i <= atom->ntypes; i++) { + h_map(i) = map[i]; + } + } + + Kokkos::deep_copy(d_radelem,h_radelem); + Kokkos::deep_copy(d_wjelem,h_wjelem); + if (switchinnerflag){ + Kokkos::deep_copy(d_sinnerelem,h_sinnerelem); + Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); + } + if (chemflag){ + Kokkos::deep_copy(d_map,h_map); + } + Kokkos::deep_copy(d_test,h_test); + + snaKK = SNAKokkos(*this); + snaKK.grow_rij(0,0); + snaKK.init(); +} + +// Destructor + +template +ComputeSNAGridLocalKokkos::~ComputeSNAGridLocalKokkos() +{ + if (copymode) return; + + memoryKK->destroy_kokkos(k_cutsq,cutsq); + memoryKK->destroy_kokkos(k_alocal,alocal); +} + +// Setup + +template +void ComputeSNAGridLocalKokkos::setup() +{ + + ComputeGridLocal::setup(); + + // allocate arrays + memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal"); + array_local = alocal; + d_alocal = k_alocal.template view(); +} + +// Compute + +template +void ComputeSNAGridLocalKokkos::compute_local() +{ + if (host_flag) { + ComputeSNAGridLocal::compute_array(); + return; + } + + copymode = 1; + + zlen = nzhi-nzlo+1; + ylen = nyhi-nylo+1; + xlen = nxhi-nxlo+1; + total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1); + + atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); + x = atomKK->k_x.view(); + type = atomKK->k_type.view(); + k_cutsq.template sync(); + + // max_neighs is defined here - think of more elaborate methods. + max_neighs = 100; + + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // number of atoms. + + ntotal = atomKK->nlocal + atomKK->nghost; + // Allocate view for number of neighbors per grid point + MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range); + + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + // `total_range` is the number of grid points which may be larger than chunk size. 
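+  // A sketch of the chunk bookkeeping (vector_length value assumed): with
+  // chunk_size = 32768 and vector_length = 32, chunk_size_div below is
+  // ceil(32768/32) = 1024, i.e. the number of vector_length-wide blocks of
+  // grid points that the range policies iterate over.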
+ chunk_size = MIN(chunksize, total_range); + chunk_offset = 0; + //snaKK.grow_rij(chunk_size, ntotal); + snaKK.grow_rij(chunk_size, max_neighs); + + //chunk_size = total_range; + + // Pre-compute ceil(chunk_size / vector_length) for code cleanliness + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + + if (triclinic) { + h0 = domain->h[0]; + h1 = domain->h[1]; + h2 = domain->h[2]; + h3 = domain->h[3]; + h4 = domain->h[4]; + h5 = domain->h[5]; + lo0 = domain->boxlo[0]; + lo1 = domain->boxlo[1]; + lo2 = domain->boxlo[2]; + } + + while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory + + if (chunk_size > total_range - chunk_offset) + chunk_size = total_range - chunk_offset; + + + //ComputeNeigh + { + int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); //ntotal); + + SnapAoSoATeamPolicy + policy_neigh(chunk_size, team_size_compute_neigh, vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + //ComputeCayleyKlein + { + // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h` + Snap3DRangePolicy + policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1}); + Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); + } + + //PreUi + { + auto policy_pre_ui = snap_get_policy(chunk_size_div, twojmax + 1); + Kokkos::parallel_for("PreUi", policy_pre_ui, *this); + } + + // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot + { + // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h` + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + + if (chunk_size < parallel_thresh) + { + // Version with parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this); + } else { + // Version w/out parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this); + } + } + + //TransformUi: un-"fold" ulisttot, zero ylist + { + // Expand ulisttot_re,_im -> ulisttot + // Zero out ylist + auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); + Kokkos::parallel_for("TransformUi", policy_transform_ui, *this); + } + + //Compute bispectrum + // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` + + //ComputeZi and Bi + if (nelements > 1) { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this); + + auto 
policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this); + } else { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this); + + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this); + } + + // Fill the grid array with bispectrum values + { + typename Kokkos::RangePolicy policy_fill(0,chunk_size); + Kokkos::parallel_for(policy_fill, *this); + } + + // Proceed to the next chunk. + chunk_offset += chunk_size; + + } // end while + + copymode = 0; + + k_alocal.template modify(); + k_alocal.template sync(); +} + +/* ---------------------------------------------------------------------- + Begin routines that are unique to the GPU codepath. These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials +------------------------------------------------------------------------- */ + +/* + Simple team policy functor seeing how many layers deep we can go with the parallelism. + */ +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos. + // Main difference is that we don't use the neighbor class or neighbor variables here. + // This is because the grid points are not atoms and therefore do not get assigned + // neighbors in LAMMPS. + // TODO: If we did make a neighborlist for each grid point, we could use current + // routines and avoid having to loop over all atoms (which limits us to + // natoms = max team size). + + // basic quantities associated with this team: + // team_rank : rank of thread in this team + // league_rank : rank of team in this league + // team_size : number of threads in this team + + // extract loop index + int ii = team.team_rank() + team.league_rank() * team.team_size(); + + if (ii >= chunk_size) return; + + // extract grid index + int igrid = ii + chunk_offset; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff. + // If it is, type_cache is assigned to the atom type. + // If it's not, it's assigned to -1. + //const int tile_size = ntotal; //max_neighs; // number of elements per thread + //const int team_rank = team.team_rank(); + //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + + // convert to grid indices + + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + + // index ii already captures the proper grid point + //int igrid = iz * (nx * ny) + iy * nx + ix; + + // grid2x converts igrid to ix,iy,iz like we've done before + // multiply grid integers by grid spacing delx, dely, delz + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. 
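+      // As in the CPU compute referenced above, for a triclinic box the grid
+      // spacings step through fractional (lamda) coordinates, so the grid point
+      // is mapped to Cartesian coordinates here before the dx/dy/dz distances
+      // to the atoms are formed below.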
+ + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + + const F_FLOAT xtmp = xgrid[0]; + const F_FLOAT ytmp = xgrid[1]; + const F_FLOAT ztmp = xgrid[2]; + + // Zeroing out the components, which are filled as a sum. + for (int icol = size_local_cols_base; icol < size_local_cols; icol++){ + d_alocal(igrid, icol) = 0.0; + } + + // Fill grid info columns + d_alocal(igrid, 0) = ix; + d_alocal(igrid, 1) = iy; + d_alocal(igrid, 2) = iz; + d_alocal(igrid, 3) = xtmp; + d_alocal(igrid, 4) = ytmp; + d_alocal(igrid, 5) = ztmp; + + // currently, all grid points are type 1 + // not clear what a better choice would be + + const int itype = 1; + int ielem = 0; + if (chemflag) ielem = d_map[itype]; + //const double radi = d_radelem[ielem]; + + // Compute the number of neighbors, store rsq + int ninside = 0; + + // Looping over ntotal for now. + for (int j = 0; j < ntotal; j++){ + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + // don't include atoms that share location with grid point + if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { + jtype = -1; // use -1 to signal it's outside the radius + } + + if (jtype >= 0) + ninside++; + } + + d_ninside(ii) = ninside; + + // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. + int offset = 0; + for (int j = 0; j < ntotal; j++){ + //const int jtype = type_cache[j]; + //if (jtype >= 0) { + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + int jtype = type(j); + if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) { + int jelem = 0; + if (chemflag) jelem = d_map[jtype]; + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); + // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp + // actually since the views here have values starting at 0, let's use jelem + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; + if (switchinnerflag) { + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + } + if (chemflag) + snaKK.element(ii,offset) = jelem; + else + snaKK.element(ii,offset) = 0; + offset++; + } + } +} + +/* ---------------------------------------------------------------------- + Pre-compute the Cayley-Klein parameters for reuse in later routines +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_cayley_klein(iatom, jnbor); +} + +/* 
---------------------------------------------------------------------- + Initialize the "ulisttot" structure with non-zero on-diagonal terms + and zero terms elsewhere +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + int itype = type(iatom); + int ielem = d_map[itype]; + + snaKK.pre_ui(iatom, j, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const { + if (iatom >= chunk_size) return; + + int itype = type(iatom); + int ielem = d_map[itype]; + + snaKK.pre_ui(iatom, j, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalPreUi, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int itype = type(iatom); + const int ielem = d_map[itype]; + + for (int j = 0; j <= twojmax; j++) + snaKK.pre_ui(iatom, j, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - iatom_div * max_neighs; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); + }); +} + +/* ---------------------------------------------------------------------- + De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot + structure. Zero-initialize ylist. CPU and GPU. 
+------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (idxu >= snaKK.idxu_max) return; + snaKK.transform_ui(iatom, idxu); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const { + if (iatom >= chunk_size) return; + snaKK.transform_ui(iatom, idxu); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformUi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int idxu = 0; idxu < snaKK.idxu_max; idxu++) + snaKK.transform_ui(iatom, idxu); +} + +/* ---------------------------------------------------------------------- + Compute all elements of the Z tensor and store them into the `zlist` + view +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeZi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + snaKK.template compute_zi(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeZi, const int& iatom, const int& jjz) const { + if (iatom >= chunk_size) return; + snaKK.template compute_zi(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeZi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.template compute_zi(iatom, jjz); +} + +/* ---------------------------------------------------------------------- + Compute the energy triple products and store in the "blist" view +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeBi, const int& iatom_mod, const int& jjb, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjb >= snaKK.idxb_max) return; + snaKK.template compute_bi(iatom, jjb); +} + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeBi, const int& iatom, const int& jjb) const { + if (iatom >= chunk_size) return; + snaKK.template compute_bi(iatom, jjb); +} + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeBi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) + snaKK.template compute_bi(iatom, jjb); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocal2Fill, const int& ii) const { + + // extract grid index + int igrid = ii + chunk_offset; + + // convert to grid indices + + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + + // index ii already captures the proper grid point + 
// int igrid = iz * (nx * ny) + iy * nx + ix; + // printf("ii igrid: %d %d\n", ii, igrid); + + // grid2x converts igrid to ix,iy,iz like we've done before + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. + + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + + + const auto idxb_max = snaKK.idxb_max; + + // linear contributions + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + d_alocal(igrid,icoeff+6) = snaKK.blist(ii,idx_chem,idxb); + } + +} + +/* ---------------------------------------------------------------------- + utility functions +------------------------------------------------------------------------- */ + +template +template +void ComputeSNAGridLocalKokkos::check_team_size_for(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +void ComputeSNAGridLocalKokkos::check_team_size_reduce(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +int ComputeSNAGridLocalKokkos::scratch_size_helper(int values_per_team) { + typedef Kokkos::View > ScratchViewType; + + return ScratchViewType::shmem_size(values_per_team); +} + +/* ---------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + routines used by template reference classes +------------------------------------------------------------------------- */ + + +template +ComputeSNAGridLocalKokkosDevice::ComputeSNAGridLocalKokkosDevice(class LAMMPS *lmp, int narg, char **arg) + : ComputeSNAGridLocalKokkos(lmp, narg, arg) { ; } + +template +void ComputeSNAGridLocalKokkosDevice::compute_local() +{ + Base::compute_local(); +} + +#ifdef LMP_KOKKOS_GPU +template +ComputeSNAGridLocalKokkosHost::ComputeSNAGridLocalKokkosHost(class LAMMPS *lmp, int narg, char **arg) + : ComputeSNAGridLocalKokkos(lmp, narg, arg) { ; } + +template +void ComputeSNAGridLocalKokkosHost::compute_local() +{ + Base::compute_local(); +} +#endif + +} diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index bfd9bba8aa..ae86e17b50 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -792,6 +792,14 @@ typedef tdual_float_3d::t_dev_um t_float_3d_um; typedef tdual_float_3d::t_dev_const_um t_float_3d_const_um; typedef tdual_float_3d::t_dev_const_randomread t_float_3d_randomread; +//4d float array n +typedef Kokkos::DualView tdual_float_4d; +typedef tdual_float_4d::t_dev t_float_4d; +typedef tdual_float_4d::t_dev_const t_float_4d_const; +typedef tdual_float_4d::t_dev_um t_float_4d_um; +typedef tdual_float_4d::t_dev_const_um t_float_4d_const_um; +typedef 
tdual_float_4d::t_dev_const_randomread t_float_4d_randomread; + #ifdef LMP_KOKKOS_NO_LEGACY typedef Kokkos::DualView tdual_float_1d_4; #else @@ -1126,6 +1134,14 @@ typedef tdual_float_3d::t_host_um t_float_3d_um; typedef tdual_float_3d::t_host_const_um t_float_3d_const_um; typedef tdual_float_3d::t_host_const_randomread t_float_3d_randomread; +//4d float array n +typedef Kokkos::DualView tdual_float_4d; +typedef tdual_float_4d::t_host t_float_4d; +typedef tdual_float_4d::t_host_const t_float_4d_const; +typedef tdual_float_4d::t_host_um t_float_4d_um; +typedef tdual_float_4d::t_host_const_um t_float_4d_const_um; +typedef tdual_float_4d::t_host_const_randomread t_float_4d_randomread; + #ifdef LMP_KOKKOS_NO_LEGACY typedef Kokkos::DualView tdual_float_1d_4; #else diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h index c194cc173f..a94d9eb1e6 100644 --- a/src/KOKKOS/memory_kokkos.h +++ b/src/KOKKOS/memory_kokkos.h @@ -101,6 +101,7 @@ template { data = TYPE(std::string(name),n1,n2); h_data = Kokkos::create_mirror_view(data); + //printf(">>> name: %s\n", name); return data; } @@ -111,6 +112,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array, data = TYPE(std::string(name),n1,n2); bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1; array = (typename TYPE::value_type **) smalloc(nbytes,name); + //printf(">>> name %s nbytes %d\n", name, nbytes); for (int i = 0; i < n1; i++) { if (n2 == 0) @@ -121,6 +123,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array, return data; } +/* ---------------------------------------------------------------------- + create a 4d array with indices 2,3,4 offset, but not first + 2nd index from n2lo to n2hi inclusive + 3rd index from n3lo to n3hi inclusive + 4th index from n4lo to n4hi inclusive + cannot grow it +------------------------------------------------------------------------- */ + +template +TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array, + int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi, + const char *name) +{ + //if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array = nullptr; + + printf("^^^^^ memoryKK->create_4d_offset_kokkos\n"); + + int n2 = n2hi - n2lo + 1; + int n3 = n3hi - n3lo + 1; + int n4 = n4hi - n4lo + 1; + data = TYPE(std::string(name),n1,n2,n3,n4); + bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1; + array = (typename TYPE::value_type ****) smalloc(nbytes,name); + + for (int i = 0; i < n1; i++) { + if (n2 == 0) { + array[i] = nullptr; + } else { + nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2; + array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name); + for (int j = 0; j < n2; j++){ + if (n3 == 0){ + array[i][j] = nullptr; + } else { + nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3; + array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name); + for (int k = 0; k < n3; k++){ + if (n4 == 0) + array[i][j][k] = nullptr; + else + array[i][j][k] = &data.h_view(i,j,k,0); + } + } + } + } + } + + return data; +} + template TYPE create_kokkos(TYPE &data, HTYPE &h_data, typename TYPE::value_type **&array, int n1, int n2, diff --git a/src/KOKKOS/pair_mliap_kokkos.cpp b/src/KOKKOS/pair_mliap_kokkos.cpp index 3c5bb7d910..599c49f523 100644 --- a/src/KOKKOS/pair_mliap_kokkos.cpp +++ b/src/KOKKOS/pair_mliap_kokkos.cpp @@ -240,6 +240,7 @@ void PairMLIAPKokkos::coeff(int narg, char **arg) { if (strcmp(elemname,descriptor->elements[jelem]) == 0) break; + //printf(">>> 
nelements: %d\n", descriptor->nelements); if (jelem < descriptor->nelements) map[i] = jelem; else if (strcmp(elemname,"NULL") == 0) map[i] = -1; diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 660503eed8..4dc4029d12 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -375,7 +375,6 @@ class PairSNAPKokkos : public PairSNAP { // Make SNAKokkos a friend friend class SNAKokkos; - }; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 2b9b862645..17ce8e1c9d 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -3,12 +3,10 @@ LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories LAMMPS development team: developers@lammps.org - Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ @@ -39,17 +37,6 @@ namespace LAMMPS_NS { -// Outstanding issues with quadratic term -// 1. there seems to a problem with compute_optimized energy calc -// it does not match compute_regular, even when quadratic coeffs = 0 - -//static double t1 = 0.0; -//static double t2 = 0.0; -//static double t3 = 0.0; -//static double t4 = 0.0; -//static double t5 = 0.0; -//static double t6 = 0.0; -//static double t7 = 0.0; /* ---------------------------------------------------------------------- */ template @@ -219,7 +206,8 @@ void PairSNAPKokkos::compute(int eflag_in, // team_size_compute_neigh is defined in `pair_snap_kokkos.h` int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); - SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); + SnapAoSoATeamPolicy + policy_neigh(chunk_size,team_size_compute_neigh,vector_length); policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); } @@ -259,7 +247,8 @@ void PairSNAPKokkos::compute(int eflag_in, const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this); } else { @@ -269,7 +258,8 @@ void PairSNAPKokkos::compute(int eflag_in, const int n_teams = chunk_size_div * max_neighs; const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this); } @@ -536,8 +526,7 @@ void PairSNAPKokkos::coeff(int narg, char Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); Kokkos::deep_copy(d_map,h_map); - snaKK = SNAKokkos(*this); //rfac0,twojmax, - //rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK = SNAKokkos(*this); 
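+  // SNAKokkos is now built through a templated constructor that copies the SNAP settings
+  // (twojmax, rfac0, rmin0, the switch/chem/bzero/bnorm/wselfall/quadratic flags, d_coeffelem)
+  // from any class that provides them (see the sna_kokkos.h changes below), so the new grid
+  // computes can construct a SNAKokkos the same way the pair style does here.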
snaKK.grow_rij(0,0); snaKK.init(); } diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index a438ccd25e..61aebaf97d 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -29,7 +29,9 @@ #endif namespace LAMMPS_NS { - +// forward declaration of PairSNAPKokkos (defined in pair_snap_kokkos.h) +// so that sna_kokkos.h can refer to it +template class PairSNAPKokkos; template struct WignerWrapper { using real_type = real_type_; @@ -170,9 +172,9 @@ class SNAKokkos { KOKKOS_INLINE_FUNCTION SNAKokkos(const SNAKokkos& sna, const typename Kokkos::TeamPolicy::member_type& team); + template inline - //SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int); - SNAKokkos(const PairSNAPKokkos&); + SNAKokkos(const CopyClass&); KOKKOS_INLINE_FUNCTION ~SNAKokkos(); diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 9a97f229b5..622ef0b8ae 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -29,17 +29,18 @@ static const double MY_PI = 3.14159265358979323846; // pi static const double MY_PI2 = 1.57079632679489661923; // pi/2 template +template inline -SNAKokkos::SNAKokkos(const PairSNAPKokkos& psk) - : rfac0(psk.rfac0), rmin0(psk.rmin0), switch_flag(psk.switchflag), - bzero_flag(psk.bzeroflag), chem_flag(psk.chemflag), bnorm_flag(psk.bnormflag), - wselfall_flag(psk.wselfallflag), switch_inner_flag(psk.switchinnerflag), - quadratic_flag(psk.quadraticflag), twojmax(psk.twojmax), d_coeffelem(psk.d_coeffelem) +SNAKokkos::SNAKokkos(const CopyClass& copy) + : twojmax(copy.twojmax), d_coeffelem(copy.d_coeffelem), rmin0(copy.rmin0), + rfac0(copy.rfac0), switch_flag(copy.switchflag), switch_inner_flag(copy.switchinnerflag), + chem_flag(copy.chemflag), bnorm_flag(copy.bnormflag), wselfall_flag(copy.wselfallflag), + quadratic_flag(copy.quadraticflag), bzero_flag(copy.bzeroflag) { wself = static_cast(1.0); if (chem_flag) - nelements = psk.nelements; + nelements = copy.nelements; else nelements = 1; @@ -611,7 +612,6 @@ void SNAKokkos::evaluate_ui_jbend(const Wi } ulist_wrapper.set(ma, ulist_accum); - mb++; } @@ -830,7 +830,6 @@ typename SNAKokkos::complex SNAKokkos +#include + +using namespace LAMMPS_NS; +using MathConst::MY_2PI; +using MathSpecial::powint; + +ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char **arg) : + ComputeGridLocal(lmp, narg, arg), cutsq(nullptr), radelem(nullptr), + sigmaelem(nullptr), prefacelem(nullptr), argfacelem(nullptr) +{ + // skip over arguments used by base class + // so that argument positions are identical to + // regular per-atom compute + + arg += nargbase; + narg -= nargbase; + + int ntypes = atom->ntypes; + int nargmin = 4 + 2 * ntypes; + + if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style); + + // process required arguments + + memory->create(radelem, ntypes + 1, "gaussian/atom:radelem"); // offset by 1 to match up with types + memory->create(sigmaelem, ntypes + 1, "gaussian/atom:sigmaelem"); + memory->create(prefacelem, ntypes + 1, "gaussian/atom:prefacelem"); + memory->create(argfacelem, ntypes + 1, "gaussian/atom:argfacelem"); + + rcutfac = utils::numeric(FLERR, arg[3], false, lmp); + + for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp); + for (int i = 0; i < ntypes; i++) + sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp); + + // construct cutsq + double cut; + cutmax = 0.0; + memory->create(cutsq, ntypes 
+ 1, ntypes + 1, "gaussian/atom:cutsq"); + for (int i = 1; i <= ntypes; i++) { + cut = 2.0 * radelem[i] * rcutfac; + if (cut > cutmax) cutmax = cut; + cutsq[i][i] = cut * cut; + for (int j = i + 1; j <= ntypes; j++) { + cut = (radelem[i] + radelem[j]) * rcutfac; + cutsq[i][j] = cutsq[j][i] = cut * cut; + } + } + + size_local_cols = size_local_cols_base + ntypes; + + // pre-compute coefficients + for (int i = 0; i < ntypes; i++) { + prefacelem[i + 1] = 1.0/powint(sigmaelem[i + 1] * sqrt(MY_2PI), 3); + argfacelem[i + 1] = 1.0/(2.0 * sigmaelem[i + 1] * sigmaelem[i + 1]); + } +} + +/* ---------------------------------------------------------------------- */ + +ComputeGaussianGridLocal::~ComputeGaussianGridLocal() +{ + if (copymode) return; + memory->destroy(radelem); + memory->destroy(sigmaelem); + memory->destroy(prefacelem); + memory->destroy(argfacelem); + memory->destroy(cutsq); +} + +/* ---------------------------------------------------------------------- */ + +void ComputeGaussianGridLocal::init() +{ + if ((modify->get_compute_by_style("^gaussian/grid/local$").size() > 1) && (comm->me == 0)) + error->warning(FLERR, "More than one instance of compute gaussian/grid/local"); +} + +/* ---------------------------------------------------------------------- */ + +void ComputeGaussianGridLocal::compute_local() +{ + invoked_local = update->ntimestep; + + // compute gaussian for each gridpoint + + double **const x = atom->x; + const int *const mask = atom->mask; + int *const type = atom->type; + const int ntotal = atom->nlocal + atom->nghost; + + int igrid = 0; + for (int iz = nzlo; iz <= nzhi; iz++) + for (int iy = nylo; iy <= nyhi; iy++) + for (int ix = nxlo; ix <= nxhi; ix++) { + double xgrid[3]; + grid2x(ix, iy, iz, xgrid); + const double xtmp = xgrid[0]; + const double ytmp = xgrid[1]; + const double ztmp = xgrid[2]; + + // zero the per-type columns; they are accumulated over atoms below + for (int icol = size_local_cols_base; icol < size_local_cols; icol++) { + alocal[igrid][icol] = 0.0; + } + + for (int j = 0; j < ntotal; j++) { + + // check that j is in compute group + + if (!(mask[j] & groupbit)) continue; + + const double delx = xtmp - x[j][0]; + const double dely = ytmp - x[j][1]; + const double delz = ztmp - x[j][2]; + const double rsq = delx * delx + dely * dely + delz * delz; + int jtype = type[j]; + if (rsq < cutsq[jtype][jtype]) { + int icol = size_local_cols_base + jtype - 1; + alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]); + } + } + igrid++; + } +} + +/* ---------------------------------------------------------------------- + memory usage +------------------------------------------------------------------------- */ + +double ComputeGaussianGridLocal::memory_usage() +{ + int n = atom->ntypes + 1; + double nbytes = (double) n * 4.0 * sizeof(double); // radelem, sigmaelem, prefacelem, argfacelem + nbytes += (double) n * n * sizeof(double); // cutsq + + return nbytes; +} diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h b/src/ML-SNAP/compute_gaussian_grid_local.h new file mode 100644 index 0000000000..77f88a7a8e --- /dev/null +++ b/src/ML-SNAP/compute_gaussian_grid_local.h @@ -0,0 +1,51 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(gaussian/grid/local,ComputeGaussianGridLocal); +// clang-format on +#else + +#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H +#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H + +#include "compute_grid_local.h" + +namespace LAMMPS_NS { + +class ComputeGaussianGridLocal : public ComputeGridLocal { + public: + ComputeGaussianGridLocal(class LAMMPS *, int, char **); + ~ComputeGaussianGridLocal() override; + void init() override; + void compute_local() override; + double memory_usage() override; + + protected: + int ncoeff; + double **cutsq; + double rcutfac; // global cut-off scale + double *radelem; // cut-off radius of each atom type + double *sigmaelem; // Gaussian width of each atom type + double *prefacelem; // Gaussian prefactor of each atom type + double *argfacelem; // Gaussian argument factor of each atom type + int *map; // map types to [0,nelements) + int nelements; + double cutmax; +}; + +} // namespace LAMMPS_NS + +#endif +#endif diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp index 2179bb8ebd..12135c705d 100644 --- a/src/ML-SNAP/compute_grid.cpp +++ b/src/ML-SNAP/compute_grid.cpp @@ -57,6 +57,7 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) : ComputeGrid::~ComputeGrid() { + if (copymode) return; deallocate(); } @@ -111,7 +112,6 @@ void ComputeGrid::assign_coords_all() void ComputeGrid::allocate() { // allocate arrays - memory->create(grid, size_array_rows, size_array_cols, "grid:grid"); memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall"); if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp index 0f275a9aae..80feb75be5 100644 --- a/src/ML-SNAP/compute_grid_local.cpp +++ b/src/ML-SNAP/compute_grid_local.cpp @@ -119,6 +119,8 @@ void ComputeGridLocal::allocate() void ComputeGridLocal::deallocate() { + if (copymode) return; + if (gridlocal_allocated) { gridlocal_allocated = 0; memory->destroy(alocal); diff --git a/src/ML-SNAP/compute_sna_grid.cpp b/src/ML-SNAP/compute_sna_grid.cpp index 4243202545..95c3fa70a8 100644 --- a/src/ML-SNAP/compute_sna_grid.cpp +++ b/src/ML-SNAP/compute_sna_grid.cpp @@ -31,14 +31,13 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : // skip over arguments used by base class // so that argument positions are identical to // regular per-atom compute - arg += nargbase; narg -= nargbase; // begin code common to all SNAP computes - double rfac0, rmin0; - int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + //double rfac0, rmin0; + //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; int ntypes = atom->ntypes; int nargmin = 6 + 2 * ntypes; @@ -56,6 +55,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : wselfallflag = 0; switchinnerflag = 0; nelements = 1; + chunksize = 32768; + parallel_thresh = 8192; // process required arguments @@ -67,8 +68,9 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : twojmax = utils::inumeric(FLERR, arg[5], false, lmp); for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp); - for (int i = 0; i < ntypes; i++) + for (int i = 0; i < ntypes; i++) { wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp); + } // 
construct cutsq @@ -181,11 +183,12 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid::~ComputeSNAGrid() { + if (copymode) return; + memory->destroy(radelem); memory->destroy(wjelem); memory->destroy(cutsq); delete snaptr; - if (chemflag) memory->destroy(map); } @@ -202,6 +205,7 @@ void ComputeSNAGrid::init() void ComputeSNAGrid::compute_array() { + invoked_array = update->ntimestep; // compute sna for each gridpoint diff --git a/src/ML-SNAP/compute_sna_grid.h b/src/ML-SNAP/compute_sna_grid.h index 3a5a373826..a158c2342f 100644 --- a/src/ML-SNAP/compute_sna_grid.h +++ b/src/ML-SNAP/compute_sna_grid.h @@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid { void init() override; void compute_array() override; double memory_usage() override; + int ncoeff,nelements; // public for kokkos, but could go in the protected block now - private: - int ncoeff; + protected: + //int ncoeff; double **cutsq; double rcutfac; double *radelem; double *wjelem; int *map; // map types to [0,nelements) - int nelements, chemflag; + int chemflag; int switchinnerflag; double *sinnerelem; double *dinnerelem; + int parallel_thresh; class SNA *snaptr; double cutmax; int quadraticflag; + double rfac0, rmin0; + int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + int chunksize; + }; } // namespace LAMMPS_NS diff --git a/src/ML-SNAP/compute_sna_grid_local.cpp b/src/ML-SNAP/compute_sna_grid_local.cpp index 1d42a42c05..db49063920 100644 --- a/src/ML-SNAP/compute_sna_grid_local.cpp +++ b/src/ML-SNAP/compute_sna_grid_local.cpp @@ -37,8 +37,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) : // begin code common to all SNAP computes - double rfac0, rmin0; - int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + //double rfac0, rmin0; + //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; int ntypes = atom->ntypes; int nargmin = 6 + 2 * ntypes; @@ -56,6 +56,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) : wselfallflag = 0; switchinnerflag = 0; nelements = 1; + chunksize = 32768; + parallel_thresh = 8192; // process required arguments @@ -180,6 +182,7 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocal::~ComputeSNAGridLocal() { + if (copymode) return; memory->destroy(radelem); memory->destroy(wjelem); memory->destroy(cutsq); diff --git a/src/ML-SNAP/compute_sna_grid_local.h b/src/ML-SNAP/compute_sna_grid_local.h index 0475212e13..85662ad509 100644 --- a/src/ML-SNAP/compute_sna_grid_local.h +++ b/src/ML-SNAP/compute_sna_grid_local.h @@ -32,7 +32,7 @@ class ComputeSNAGridLocal : public ComputeGridLocal { void compute_local() override; double memory_usage() override; - private: + protected: int ncoeff; double **cutsq; double rcutfac; @@ -46,6 +46,10 @@ class ComputeSNAGridLocal : public ComputeGridLocal { class SNA *snaptr; double cutmax; int quadraticflag; + double rfac0, rmin0; + int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + int chunksize; + int parallel_thresh; }; } // namespace LAMMPS_NS
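For reference (not additional patch content), the coefficients precomputed in the ComputeGaussianGridLocal constructor, prefacelem[t] = 1/(sigma_t*sqrt(2*pi))^3 and argfacelem[t] = 1/(2*sigma_t^2), make each atom's contribution to its type column a unit-normalized three-dimensional Gaussian:

.. math::

   g_t(r) = \frac{1}{\left(\sigma_t\sqrt{2\pi}\right)^{3}}
            \exp\!\left(-\frac{r^{2}}{2\sigma_t^{2}}\right)
          = \left(2\pi\sigma_t^{2}\right)^{-3/2}
            \exp\!\left(-\frac{r^{2}}{2\sigma_t^{2}}\right),
   \qquad \int g_t(r)\, d^{3}r = 1,

so compute_local() accumulates prefacelem[t] * exp(-rsq * argfacelem[t]) per neighbor, and the summed field for type t integrates to the number of type-t atoms in range, up to the truncation at rsq >= cutsq[t][t].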