Merge remote-tracking branch 'github/develop' into collected-small-changes

2025-01-23 12:06:59 -05:00
parent f8484de195 0dfd5d821a
commit b5da9ce3e2
36 changed files with 3434 additions and 56 deletions
--- a/doc/src/Commands_compute.rst
+++ b/doc/src/Commands_compute.rst
@ -58,6 +58,7 @@ KOKKOS, o = OPENMP, t = OPT.
   * :doc:`fep/ta <compute_fep_ta>`
   * :doc:`force/tally <compute_tally>`
   * :doc:`fragment/atom <compute_cluster_atom>`
   * :doc:`gaussian/grid/local (k) <compute_gaussian_grid_local>`
   * :doc:`global/atom <compute_global_atom>`
   * :doc:`group/group <compute_group_group>`
   * :doc:`gyration <compute_gyration>`
@ -140,8 +141,8 @@ KOKKOS, o = OPENMP, t = OPT.
   * :doc:`smd/vol <compute_smd_vol>`
   * :doc:`snap <compute_sna_atom>`
   * :doc:`sna/atom <compute_sna_atom>`
-   * :doc:`sna/grid <compute_sna_atom>`
+   * :doc:`sna/grid (k) <compute_sna_atom>`
-   * :doc:`sna/grid/local <compute_sna_atom>`
+   * :doc:`sna/grid/local (k) <compute_sna_atom>`
   * :doc:`snad/atom <compute_sna_atom>`
   * :doc:`snav/atom <compute_sna_atom>`
   * :doc:`sph/e/atom <compute_sph_e_atom>`
--- a/doc/src/compute.rst
+++ b/doc/src/compute.rst
@ -236,6 +236,7 @@ The individual style names on the :doc:`Commands compute <Commands_compute>` pag
 * :doc:`fep/ta <compute_fep_ta>` - compute free energies for a test area perturbation
 * :doc:`force/tally <compute_tally>` - force between two groups of atoms via the tally callback mechanism
 * :doc:`fragment/atom <compute_cluster_atom>` - fragment ID for each atom
 * :doc:`gaussian/grid/local <compute_gaussian_grid_local>` - local array of Gaussian atomic contributions on a regular grid
 * :doc:`global/atom <compute_global_atom>` - assign global values to each atom from arrays of global values
 * :doc:`group/group <compute_group_group>` - energy/force between two groups of atoms
 * :doc:`gyration <compute_gyration>` - radius of gyration of group of atoms
--- a/doc/src/compute_gaussian_grid_local.rst
+++ b/doc/src/compute_gaussian_grid_local.rst
@ -0,0 +1,97 @@
 .. index:: compute gaussian/grid/local
 .. index:: compute gaussian/grid/local/kk
 compute gaussian/grid/local command
 ===================================
 Accelerator Variants: *gaussian/grid/local/kk*
 Syntax
 """"""
 .. code-block:: LAMMPS
   compute ID group-ID gaussian/grid/local grid nx ny nz rcutfac R_1 R_2 ... sigma_1 sigma_2 ...
 * ID, group-ID are documented in :doc:`compute <compute>` command
 * gaussian/grid/local = style name of this compute command
 * *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
 * *rcutfac* = scale factor applied to all cutoff radii (positive real)
 * *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
 * *sigma_1, sigma_2,...* = Gaussian widths, one for each type (distance units)
 Examples
 """"""""
 .. code-block:: LAMMPS
    compute mygrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4
 Description
 """""""""""
 Define a computation that calculates a Gaussian representation of the ionic
 structure. This representation is used for the efficient evaluation
 of quantities related to the structure factor in a grid-based workflow,
 such as the ML-DFT workflow MALA :ref:`(Ellis) <Ellis2021b>`, for which it was originally
 implemented. Usage of the workflow is described in a separate publication :ref:`(Fiedler) <Fiedler2023>`.
 For each LAMMPS type, a separate sum of Gaussians is calculated, using
 a separate Gaussian broadening per type. The computation
 is always performed on the numerical grid; no atom-based version of this
 compute exists. The Gaussian representation can only be evaluated in a local
 fashion, thus the output array only contains rows for grid points
 that are local to the processor subdomain. The layout of the grid is the same
 as for the :doc:`sna/grid/local <compute_sna_atom>` command.
 Namely, the array contains one row for each of the
 local grid points, looping over the global index *ix* fastest,
 then *iy*, and *iz* slowest.  Each row of the array contains
 the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*,
 and *z* coordinates of the grid point, followed by the values of the Gaussians
 (one floating point number per type per grid point).
 ----------
 .. include:: accel_styles.rst
 ----------
 Output info
 """""""""""
 Compute *gaussian/grid/local* evaluates a local array.
 The array contains one row for each of the
 local grid points, looping over the global index *ix* fastest,
 then *iy*, and *iz* slowest.  The array contains :math:`ntypes+6` columns,
 where *ntypes* is the number of LAMMPS types. The first three columns are
 the global indexes *ix*, *iy*, and *iz*, followed by the *x*, *y*,
 and *z* coordinates of the grid point, followed by the *ntypes* columns
 containing the values of the Gaussians for each type.
 Restrictions
 """"""""""""
 This compute is part of the ML-SNAP package.  It is only enabled
 if LAMMPS was built with that package.  See the :doc:`Build package
 <Build_package>` page for more info.
 Related commands
 """"""""""""""""
 :doc:`compute sna/grid/local <compute_sna_atom>`
 ----------
 .. _Ellis2021b:
 **(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
 .. _Fiedler2023:
 **(Fiedler)** Fiedler, Modine, Schmerler, Vogel, Popoola, Thompson, Rajamanickam, and Cangi,
 `npj Comp. Mater., 9, 115 (2023) <https://doi.org/10.1038/s41524-023-01070-z>`_
--- a/doc/src/compute_sna_atom.rst
+++ b/doc/src/compute_sna_atom.rst
@ -3,7 +3,9 @@
 .. index:: compute snav/atom
 .. index:: compute snap
 .. index:: compute sna/grid
 .. index:: compute sna/grid/kk
 .. index:: compute sna/grid/local
 .. index:: compute sna/grid/local/kk
 compute sna/atom command
 ========================
@ -20,9 +22,14 @@ compute snap command
 compute sna/grid command
 ========================
 compute sna/grid/kk command
 ===========================
 compute sna/grid/local command
 ==============================
 Accelerator Variants: *sna/grid/local/kk*
 Syntax
 """"""
@ -33,17 +40,17 @@ Syntax
   compute ID group-ID snav/atom rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
   compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
   compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid/local nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid/local grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
 * ID, group-ID are documented in :doc:`compute <compute>` command
 * sna/atom = style name of this compute command
-* rcutfac = scale factor applied to all cutoff radii (positive real)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
-* rfac0 = parameter in distance to angle conversion (0 < rcutfac < 1)
+* *rfac0* = parameter in distance to angle conversion (0 < rfac0 < 1)
-* twojmax = band limit for bispectrum components (non-negative integer)
+* *twojmax* = band limit for bispectrum components (non-negative integer)
-* R_1, R_2,... = list of cutoff radii, one for each type (distance units)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
-* w_1, w_2,... = list of neighbor weights, one for each type
+* *w_1, w_2,...* = list of neighbor weights, one for each type
-* nx, ny, nz = number of grid points in x, y, and z directions (positive integer)
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
 * zero or more keyword/value pairs may be appended
 * keyword = *rmin0* or *switchflag* or *bzeroflag* or *quadraticflag* or *chem* or *bnormflag* or *wselfallflag* or *bikflag* or *switchinnerflag* or *sinner* or *dinner* or *dgradflag* or *nnn* or *wmode* or *delta*
@ -103,7 +110,7 @@ Examples
   compute snap all snap 1.4 0.95 6 2.0 1.0
   compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 chem 2 0 1
   compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 switchinnerflag 1 sinner 1.35 1.6 dinner 0.25 0.3
-   compute bgrid all sna/grid/local 200 200 200 1.4 0.95 6 2.0 1.0
+   compute bgrid all sna/grid/local grid 200 200 200 1.4 0.95 6 2.0 1.0
   compute bnnn all sna/atom 9.0 0.99363 8 0.5 1.0 rmin0 0.0 nnn 24 wmode 1 delta 0.2
 Description
@ -252,7 +259,8 @@ for finite-temperature Kohn-Sham density functional theory (:ref:`Ellis
 et al. <Ellis2021>`) Neighbor atoms not in the group do not contribute
 to the bispectrum components of the grid points. The distance cutoff
 :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom
-*i'*.
+*i'*. Both computes can be hardware accelerated with Kokkos by using the
 *sna/grid/kk* and *sna/grid/local/kk* commands, respectively.
 Compute *sna/grid* calculates a global array containing bispectrum
 components for a regular grid of points.
@ -463,6 +471,12 @@ fluctuations in the resulting local atomic environment fingerprint.  The
 detailed formalism is given in the paper by Lafourcade et
 al. :ref:`(Lafourcade) <Lafourcade2023_2>`.
 ----------
 .. include:: accel_styles.rst
 ----------
 Output info
@ -654,7 +668,7 @@ of Angular Momentum, World Scientific, Singapore (1987).
 .. _Ellis2021:
-**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam,  Phys Rev B, 104, 035120, (2021)
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
 .. _Lafourcade2023_2:
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@ -3380,6 +3380,7 @@ Schilfgarde
 Schimansky
 Schiotz
 Schlitter
 Schmerler
 Schmid
 Schnieders
 Schoen
@ -4042,6 +4043,7 @@ VMDARCH
 VMDHOME
 vn
 Voigt
 Vogel
 volfactor
 Volkov
 Volpe
--- a/examples/snap/README.md
+++ b/examples/snap/README.md
@ -9,5 +9,11 @@ in.snap.Mo_Chen                   # SNAP linear Mo potential
 in.snap.compute                   # SNAP compute for training a linear model
 in.snap.compute.quadratic         # SNAP compute for training a quadratic model
 in.snap.scale.Ni_Zuo_JCPA2020     # SNAP linear Ni potential with thermodynamic integration (fix adapt scale)
 in.C_SNAP                         # SNAP carbon potential
 compute_snap_dgrad.py             # SNAP compute with dgradflag (dBi/dRj) for training a non-linear model
 in.snap.grid                      # SNAP descriptors on a grid
 in.snap.grid.triclinic            # SNAP descriptors on a grid, triclinic
 in.gaussian.grid                  # Gaussian descriptors on a grid
--- a/examples/snap/in.gaussian.grid
+++ b/examples/snap/in.gaussian.grid
@ -0,0 +1,68 @@
 # Demonstrate calculation of Gaussian descriptors on a grid
 # for a cell with two atoms of type 1 and type 2.
 # The output in dump.glocal shows that for grid points
 # sitting on an atom of type 1 or 2:
 # val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
 # val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
 # These values are extracted to the log file
 # 
 variable 	nrep index 1
 variable 	a index 3.316
 variable 	ngrid index 2
 units		metal
 atom_modify	map hash
 # generate the box and atom positions using a BCC lattice
 variable       	nx equal ${nrep}
 variable 	ny equal ${nrep}
 variable 	nz equal ${nrep}
 boundary	p p p
 lattice		custom $a &
 		a1 1 0 0 &
 		a2 0 1 0  &
 		a3 0 0 1 &
 		basis 0 0 0 &
 		basis 0.5 0.5 0.5 &
 region		box block 0 ${nx} 0 ${ny} 0 ${nz}
 create_box	2 box
 create_atoms	1 box basis 1 1 basis 2 2
 mass 		* 180.88
 # define atom compute and grid compute
 variable 	rcutfac equal 4.67637
 variable 	radelem1 equal 0.5
 variable 	radelem2 equal 0.5
 variable	sigmaelem1 equal 0.1355
 variable	sigmaelem2 equal 0.2
 variable 	gaussian_options string &
 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
 # build zero potential to force ghost atom creation
 pair_style      zero ${rcutfac}
 pair_coeff      * *
 # define atom and grid computes
 compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} &
 	 	${gaussian_options}
 # define output
 dump		1 all local 1000 dump.glocal c_mygridlocal[*]
 dump 		2 all custom 1000 dump.gatom id x y z
 compute		val1 all reduce max c_mygridlocal[7] inputs local
 compute		val2 all reduce max c_mygridlocal[8] inputs local
 thermo_style	custom step c_val1 c_val2
 # run
 run		0
--- a/examples/snap/in.snap.grid
+++ b/examples/snap/in.snap.grid
--- a/examples/snap/in.snap.grid.triclinic
+++ b/examples/snap/in.snap.grid.triclinic
@ -47,7 +47,6 @@ lattice		custom $a &
 		basis 0.0 0.0 0.5 &
 		spacing 1 1 1
 box 		tilt large
 region		box prism 0 ${nx} 0 ${ny} 0 ${nz} ${ny} ${nz} ${nz}
 create_box	1 box
 create_atoms	1 box
--- a/examples/snap/log.10Dec24.gaussian.grid.g++.1
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.1
@ -0,0 +1,129 @@
 LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
 OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
  using 1 OpenMP thread(s) per MPI task
 # Demonstrate calculation of Gaussian descriptors on a grid
 # for a cell with two atoms of type 1 and type 2.
 # The output in dump.glocal shows that for grid points
 # sitting on an atom of type 1 or 2:
 # val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
 # val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
 # These values are extracted to the log file
 #
 variable 	nrep index 1
 variable 	a index 3.316
 variable 	ngrid index 2
 units		metal
 atom_modify	map hash
 # generate the box and atom positions using a BCC lattice
 variable       	nx equal ${nrep}
 variable       	nx equal 1
 variable 	ny equal ${nrep}
 variable 	ny equal 1
 variable 	nz equal ${nrep}
 variable 	nz equal 1
 boundary	p p p
 lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
 lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
 Lattice spacing in x,y,z = 3.316 3.316 3.316
 region		box block 0 ${nx} 0 ${ny} 0 ${nz}
 region		box block 0 1 0 ${ny} 0 ${nz}
 region		box block 0 1 0 1 0 ${nz}
 region		box block 0 1 0 1 0 1
 create_box	2 box
 Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
  1 by 1 by 1 MPI processor grid
 create_atoms	1 box basis 1 1 basis 2 2
 Created 2 atoms
  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
  create_atoms CPU = 0.001 seconds
 mass 		* 180.88
 # define atom compute and grid compute
 variable 	rcutfac equal 4.67637
 variable 	radelem1 equal 0.5
 variable 	radelem2 equal 0.5
 variable	sigmaelem1 equal 0.1355
 variable	sigmaelem2 equal 0.2
 variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
 4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
 4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
 4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
 4.67637 0.5 0.5 0.1355 ${sigmaelem2}
 4.67637 0.5 0.5 0.1355 0.2
 # build zero potential to force ghost atom creation
 pair_style      zero ${rcutfac}
 pair_style      zero 4.67637
 pair_coeff      * *
 # define atom and grid computes
 compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
 compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
 compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
 compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
 compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
 # define output
 dump		1 all local 1000 dump.glocal c_mygridlocal[*]
 dump 		2 all custom 1000 dump.gatom id x y z
 compute		val1 all reduce max c_mygridlocal[7] inputs local
 compute		val2 all reduce max c_mygridlocal[8] inputs local
 thermo_style	custom step c_val1 c_val2
 # run
 run		0
 WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
 Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
 Neighbor list info ...
  update: every = 1 steps, delay = 0 steps, check = yes
  max neighbors/atom: 2000, page size: 100000
  master list distance cutoff = 6.67637
  ghost atom cutoff = 6.67637
  binsize = 3.338185, bins = 1 1 1
  1 neighbor lists, perpetual/occasional/extra = 1 0 0
  (1) pair zero, perpetual
      attributes: half, newton on
      pair build: half/bin/atomonly/newton
      stencil: half/bin/3d
      bin: standard
 Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes
   Step         c_val1         c_val2    
         0   25.521859      7.9367045    
 Loop time of 1.088e-06 on 1 procs for 0 steps with 2 atoms
 183.8% CPU use with 1 MPI tasks x 1 OpenMP threads
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
 Pair    | 0          | 0          | 0          |   0.0 |  0.00
 Neigh   | 0          | 0          | 0          |   0.0 |  0.00
 Comm    | 0          | 0          | 0          |   0.0 |  0.00
 Output  | 0          | 0          | 0          |   0.0 |  0.00
 Modify  | 0          | 0          | 0          |   0.0 |  0.00
 Other   |            | 1.088e-06  |            |       |100.00
 Nlocal:              2 ave           2 max           2 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
 Nghost:            339 ave         339 max         339 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
 Neighs:             64 ave          64 max          64 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
 Total # of neighbors = 64
 Ave neighs/atom = 32
 Neighbor list builds = 0
 Dangerous builds = 0
 Total wall time: 0:00:00
--- a/examples/snap/log.10Dec24.gaussian.grid.g++.4
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4
@ -0,0 +1,130 @@
 LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
 OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
  using 1 OpenMP thread(s) per MPI task
 # Demonstrate calculation of Gaussian descriptors on a grid
 # for a cell with two atoms of type 1 and type 2.
 # The output in dump.glocal shows that for grid points
 # sitting on an atom of type 1 or 2:
 # val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
 # val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
 # These values are extracted to the log file
 #
 variable 	nrep index 1
 variable 	a index 3.316
 variable 	ngrid index 2
 units		metal
 atom_modify	map hash
 # generate the box and atom positions using a BCC lattice
 variable       	nx equal ${nrep}
 variable       	nx equal 1
 variable 	ny equal ${nrep}
 variable 	ny equal 1
 variable 	nz equal ${nrep}
 variable 	nz equal 1
 boundary	p p p
 lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
 lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
 Lattice spacing in x,y,z = 3.316 3.316 3.316
 region		box block 0 ${nx} 0 ${ny} 0 ${nz}
 region		box block 0 1 0 ${ny} 0 ${nz}
 region		box block 0 1 0 1 0 ${nz}
 region		box block 0 1 0 1 0 1
 create_box	2 box
 Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
  1 by 2 by 2 MPI processor grid
 create_atoms	1 box basis 1 1 basis 2 2
 Created 2 atoms
  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
  create_atoms CPU = 0.001 seconds
 mass 		* 180.88
 # define atom compute and grid compute
 variable 	rcutfac equal 4.67637
 variable 	radelem1 equal 0.5
 variable 	radelem2 equal 0.5
 variable	sigmaelem1 equal 0.1355
 variable	sigmaelem2 equal 0.2
 variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
 4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
 4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
 4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
 4.67637 0.5 0.5 0.1355 ${sigmaelem2}
 4.67637 0.5 0.5 0.1355 0.2
 # build zero potential to force ghost atom creation
 pair_style      zero ${rcutfac}
 pair_style      zero 4.67637
 pair_coeff      * *
 # define atom and grid computes
 compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
 compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
 compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
 compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
 compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
 # define output
 dump		1 all local 1000 dump.glocal c_mygridlocal[*]
 dump 		2 all custom 1000 dump.gatom id x y z
 compute		val1 all reduce max c_mygridlocal[7] inputs local
 compute		val2 all reduce max c_mygridlocal[8] inputs local
 thermo_style	custom step c_val1 c_val2
 # run
 run		0
 WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
 Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
 Neighbor list info ...
  update: every = 1 steps, delay = 0 steps, check = yes
  max neighbors/atom: 2000, page size: 100000
  master list distance cutoff = 6.67637
  ghost atom cutoff = 6.67637
  binsize = 3.338185, bins = 1 1 1
  1 neighbor lists, perpetual/occasional/extra = 1 0 0
  (1) pair zero, perpetual
      attributes: half, newton on
      pair build: half/bin/atomonly/newton
      stencil: half/bin/3d
      bin: standard
 WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202)
 Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes
   Step         c_val1         c_val2    
         0   25.521859      7.9367045    
 Loop time of 2.238e-06 on 4 procs for 0 steps with 2 atoms
 89.4% CPU use with 4 MPI tasks x 1 OpenMP threads
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
 Pair    | 0          | 0          | 0          |   0.0 |  0.00
 Neigh   | 0          | 0          | 0          |   0.0 |  0.00
 Comm    | 0          | 0          | 0          |   0.0 |  0.00
 Output  | 0          | 0          | 0          |   0.0 |  0.00
 Modify  | 0          | 0          | 0          |   0.0 |  0.00
 Other   |            | 2.238e-06  |            |       |100.00
 Nlocal:            0.5 ave           1 max           0 min
 Histogram: 2 0 0 0 0 0 0 0 0 2
 Nghost:          274.5 ave         275 max         274 min
 Histogram: 2 0 0 0 0 0 0 0 0 2
 Neighs:             16 ave          40 max           0 min
 Histogram: 2 0 0 0 0 0 1 0 0 1
 Total # of neighbors = 64
 Ave neighs/atom = 32
 Neighbor list builds = 0
 Dangerous builds = 0
 Total wall time: 0:00:00
--- a/src/.gitignore
+++ b/src/.gitignore
@ -252,6 +252,8 @@
 /*rheo*.cpp
 /*rheo*.h
 /compute_gaussian_grid_local.cpp
 /compute_gaussian_grid_local.h
 /compute_grid.cpp
 /compute_grid.h
 /compute_grid_local.cpp
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@ -119,6 +119,14 @@ action compute_composition_atom_kokkos.cpp compute_composition_atom.cpp
 action compute_composition_atom_kokkos.h compute_composition_atom.h
 action compute_orientorder_atom_kokkos.cpp
 action compute_orientorder_atom_kokkos.h
 action compute_sna_grid_kokkos.cpp compute_sna_grid.cpp
 action compute_sna_grid_kokkos.h compute_sna_grid.h
 action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp
 action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp
 action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h
 action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp
 action compute_gaussian_grid_local_kokkos.cpp compute_gaussian_grid_local.cpp
 action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local.h
 action compute_temp_deform_kokkos.cpp
 action compute_temp_deform_kokkos.h
 action compute_temp_kokkos.cpp
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@ -0,0 +1,327 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: Drew Rohskopf (SNL)
 ------------------------------------------------------------------------- */
 #include "compute_gaussian_grid_local_kokkos.h"
 #include "atom_kokkos.h"
 #include "atom_masks.h"
 #include "comm.h"
 #include "domain.h"
 #include "error.h"
 #include "force.h"
 #include "memory_kokkos.h"
 #include "modify.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "neighbor_kokkos.h"
 #include "pair.h"
 #include "update.h"
 #include <cmath>
 #include <cstring>
 using namespace LAMMPS_NS;
 /* ---------------------------------------------------------------------- */
 /* Construct the Kokkos variant: the base class parses the arguments, then the
    per-type parameters it holds (cutsq, radelem, sigmaelem, prefacelem,
    argfacelem) are mirrored into Kokkos views for use inside the kernel. */
 template<class DeviceType>
 ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) :
  ComputeGaussianGridLocal(lmp, narg, arg)
 {
  kokkosable = 1;
  atomKK = (AtomKokkos *) atom;
  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
  datamask_read = EMPTY_MASK;
  datamask_modify = EMPTY_MASK;
  host_flag = (execution_space == Host);
  const int n = atom->ntypes;
  // Dual view of the squared cutoffs, sized (ntypes+1)^2 so it can be indexed by type directly.
  // The labels name this class so that Kokkos diagnostics attribute the allocations correctly.
  k_cutsq = tdual_fparams("ComputeGaussianGridLocalKokkos::cutsq",n+1,n+1);
  auto d_cutsq = k_cutsq.template view<DeviceType>();
  rnd_cutsq = d_cutsq;
  for (int i = 1; i <= n; i++) {
    for (int j = 1; j <= n; j++) {
      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j];
    }
  }
  // Flag the host copy as modified once, after all entries are written;
  // compute_local() syncs it to the device before launching the kernel.
  k_cutsq.template modify<LMPHostType>();
  // Set up element lists
  MemKK::realloc_kokkos(d_radelem,"ComputeGaussianGridLocalKokkos::radelem",n);
  MemKK::realloc_kokkos(d_sigmaelem,"ComputeGaussianGridLocalKokkos::sigmaelem",n+1);
  MemKK::realloc_kokkos(d_prefacelem,"ComputeGaussianGridLocalKokkos::prefacelem",n+1);
  MemKK::realloc_kokkos(d_argfacelem,"ComputeGaussianGridLocalKokkos::argfacelem",n+1);
  MemKK::realloc_kokkos(d_map,"ComputeGaussianGridLocalKokkos::map",n+1);
  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
  auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem);
  auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem);
  auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem);
  auto h_map = Kokkos::create_mirror_view(d_map);
  // The base-class arrays are read at indices 1..ntypes; the views are filled zero-based,
  // which is why the kernel indexes them with (type - 1).
  for (int i = 1; i <= n; i++) {
    h_radelem(i-1) = radelem[i];
    h_sigmaelem(i-1) = sigmaelem[i];
    h_prefacelem(i-1) = prefacelem[i];
    h_argfacelem(i-1) = argfacelem[i];
  }
  Kokkos::deep_copy(d_radelem,h_radelem);
  Kokkos::deep_copy(d_sigmaelem,h_sigmaelem);
  Kokkos::deep_copy(d_prefacelem, h_prefacelem);
  Kokkos::deep_copy(d_argfacelem, h_argfacelem);
  // NOTE(review): h_map is never written in this constructor, so d_map only
  // receives the mirror's as-allocated contents -- confirm no type map is needed.
  Kokkos::deep_copy(d_map,h_map);
 }
 /* ---------------------------------------------------------------------- */
 /* Release the Kokkos-managed arrays unless this object is a functor copy. */
 template<class DeviceType>
 ComputeGaussianGridLocalKokkos<DeviceType>::~ComputeGaussianGridLocalKokkos()
 {
  // copymode is raised in compute_local() while *this is handed to
  // Kokkos::parallel_for; objects destroyed in that state skip all cleanup.
  if (!copymode) {
    memoryKK->destroy_kokkos(k_cutsq,cutsq);
    memoryKK->destroy_kokkos(k_alocal,alocal);
  }
 }
 /* ---------------------------------------------------------------------- */
 /* Run the grid base-class setup, then (re)create the local output array
    as a Kokkos dual view and publish it through array_local. */
 template<class DeviceType>
 void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
 {
  ComputeGridLocal::setup();
  // size_local_rows x size_local_cols storage shared between k_alocal and alocal
  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
  // view used by the kernel for writing results
  d_alocal = k_alocal.template view<DeviceType>();
  // pointer exposed as the local array of this compute
  array_local = alocal;
 }
 /* ---------------------------------------------------------------------- */
 /* Initialization hook: nothing Kokkos-specific is done here, the call is
    forwarded unchanged to ComputeGaussianGridLocal::init(). */
 template<class DeviceType>
 void ComputeGaussianGridLocalKokkos<DeviceType>::init()
 {
  ComputeGaussianGridLocal::init();
 }
 /* ---------------------------------------------------------------------- */
 /* Evaluate the Gaussian descriptors for all local grid points.
    The grid points are processed in chunks; each chunk is one team-policy
    launch of the TagComputeGaussianGridLocalNeigh kernel with one league
    member per grid point. Results are written to d_alocal and synced to the host. */
 template<class DeviceType>
 void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
 {
  // NOTE(review): when the execution space is Host this returns without
  // evaluating anything and without updating invoked_local -- confirm this is
  // intended rather than delegating to the base-class implementation.
  if (host_flag) {
    return;
  }
  invoked_local = update->ntimestep;
  // the kernel launch copies *this; copymode keeps those copies from freeing memory
  copymode = 1;
  // extents of the local grid block and the total number of local grid points
  zlen = nzhi-nzlo+1;
  ylen = nyhi-nylo+1;
  xlen = nxhi-nxlo+1;
  total_range = zlen*ylen*xlen;
  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
  x = atomKK->k_x.view<DeviceType>();
  type = atomKK->k_type.view<DeviceType>();
  k_cutsq.template sync<DeviceType>();
  // max_neighs is defined here - think of more elaborate methods.
  max_neighs = 100;
  // The kernel loops over every local and ghost atom for each grid point.
  ntotal = atomKK->nlocal + atomKK->nghost;
  // Allocate view for number of neighbors per grid point
  MemKK::realloc_kokkos(d_ninside,"ComputeGaussianGridLocalKokkos::ninside",total_range);
  // Fixed chunk size; total_range (the number of grid points) may exceed it,
  // in which case several launches are issued below.
  chunksize = 32768;
  chunk_size = MIN(chunksize, total_range);
  chunk_offset = 0;
  // One thread per team: the kernel body is written for a single thread per grid point.
  const int vector_length_default = 1;
  const int team_size_default = 1;
  if (triclinic){
    // cache the box matrix and origin in plain members so the kernel can
    // convert lamda coordinates without calling host-only Domain functions
    h0 = domain->h[0];
    h1 = domain->h[1];
    h2 = domain->h[2];
    h3 = domain->h[3];
    h4 = domain->h[4];
    h5 = domain->h[5];
    lo0 = domain->boxlo[0];
    lo1 = domain->boxlo[1];
    lo2 = domain->boxlo[2];
  }
  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
    // the last chunk may be shorter than the others
    if (chunk_size > total_range - chunk_offset)
      chunk_size = total_range - chunk_offset;
    {
      int vector_length = vector_length_default;
      int team_size = team_size_default;
      check_team_size_for<TagComputeGaussianGridLocalNeigh>(chunk_size,team_size,vector_length);
      typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh> policy_neigh(chunk_size,team_size,vector_length);
      Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this);
    }
    // Proceed to the next chunk.
    chunk_offset += chunk_size;
  }
  copymode = 0;
  // results were produced on the device; make them visible through alocal on the host
  k_alocal.template modify<DeviceType>();
  k_alocal.template sync<LMPHostType>();
 }
 /* ---------------------------------------------------------------------- */
 /* Kernel body: one league member per grid point of the current chunk.
    Writes the grid indices and coordinates into columns 0-5 of the row and
    accumulates, per atom type, the sum of Gaussians over all atoms within
    the type's cutoff into the remaining columns.
    Written for a single thread per team, which is how compute_local() launches it. */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const
 {
  const int ii = team.league_rank();
  if (ii >= chunk_size) return;
  // row of the local array handled by this league member
  const int igrid = ii + chunk_offset;
  // convert the flat index to grid indices (ix fastest, iz slowest) ...
  int iz = igrid/(xlen*ylen);
  const int i2 = igrid - (iz*xlen*ylen);
  int iy = i2/xlen;
  int ix = i2 % xlen;
  // ... and shift by the lower bounds of the local grid block
  iz += nzlo;
  iy += nylo;
  ix += nxlo;
  // multiply grid integers by grid spacing delx, dely, delz
  double xgrid[3];
  xgrid[0] = ix * delx;
  xgrid[1] = iy * dely;
  xgrid[2] = iz * delz;
  if (triclinic) {
    // Apply the box matrix and origin cached by compute_local(); host-only
    // Domain functions cannot be called from device code.
    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
    xgrid[2] = h2*xgrid[2] + lo2;
  }
  const F_FLOAT xtmp = xgrid[0];
  const F_FLOAT ytmp = xgrid[1];
  const F_FLOAT ztmp = xgrid[2];
  // Zeroing out the components, which are filled as a sum.
  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
    d_alocal(igrid, icol) = 0.0;
  }
  // Fill grid info columns
  d_alocal(igrid, 0) = ix;
  d_alocal(igrid, 1) = iy;
  d_alocal(igrid, 2) = iz;
  d_alocal(igrid, 3) = xtmp;
  d_alocal(igrid, 4) = ytmp;
  d_alocal(igrid, 5) = ztmp;
  // Loop over all ntotal atoms and add each atom within the cutoff of its
  // own type to the column belonging to that type.
  for (int j = 0; j < ntotal; j++){
    const F_FLOAT dx = x(j,0) - xtmp;
    const F_FLOAT dy = x(j,1) - ytmp;
    const F_FLOAT dz = x(j,2) - ztmp;
    const int jtype = type(j);
    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
    if (rsq < rnd_cutsq(jtype, jtype) ) {
      // per-type views are zero-based, atom types start at 1
      const int icol = size_local_cols_base + jtype - 1;
      d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1));
    }
  }
 }
 /* ----------------------------------------------------------------------
   check max team size
 ------------------------------------------------------------------------- */
 /* Clamp team_size so that team_size*vector_length does not exceed the
    maximum team size Kokkos reports for the TagStyle kernel of this functor.
    team_size is updated in place; inum is the league size used for the query. */
 template<class DeviceType>
 template<class TagStyle>
 void ComputeGaussianGridLocalKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
  using policy_type = Kokkos::TeamPolicy<DeviceType,TagStyle>;
  const int limit = policy_type(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
  const bool fits = !(team_size*vector_length > limit);
  if (fits) return;
  team_size = limit/vector_length;
 }
 // Explicit instantiations of the templated compute. The device variant is
 // always instantiated; the host variant is instantiated separately only when
 // LMP_KOKKOS_GPU is defined (presumably because the two types coincide
 // otherwise -- confirm against kokkos_type.h).
 namespace LAMMPS_NS {
 template class ComputeGaussianGridLocalKokkos<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
 template class ComputeGaussianGridLocalKokkos<LMPHostType>;
 #endif
 }
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
@ -0,0 +1,96 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifdef COMPUTE_CLASS
 // clang-format off
 // Register the class under the /kk, /kk/device and /kk/host style names.
 // /kk and /kk/device map to LMPDeviceType, /kk/host maps to LMPHostType.
 ComputeStyle(gaussian/grid/local/kk,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
 ComputeStyle(gaussian/grid/local/kk/device,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
 ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos<LMPHostType>);
 // clang-format on
 #else
 #ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
 #define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
 #include "compute_gaussian_grid_local.h"
 #include "kokkos_type.h"
 namespace LAMMPS_NS {
 // clang-format off
 // Empty dispatch tag: selects the team-policy operator() declared in
 // ComputeGaussianGridLocalKokkos.
 struct TagComputeGaussianGridLocalNeigh{};
 // clang-format on
 // Kokkos variant of ComputeGaussianGridLocal. It is itself the functor for
 // the TagComputeGaussianGridLocalNeigh team-policy kernel and owns the
 // views that kernel reads and writes.
 template <class DeviceType> class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal {
 public:
  using device_type = DeviceType;
  using AT = ArrayTypes<DeviceType>;
  // Compile-time team size for device offload (smaller when building for HIP)
 #ifdef KOKKOS_ENABLE_HIP
  static constexpr int team_size_compute_neigh = 2;
 #else
  static constexpr int team_size_compute_neigh = 4;
 #endif
  ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **);
  ~ComputeGaussianGridLocalKokkos() override;
  void setup() override;
  void init() override;
  void compute_local() override;
  // Clamp a requested team size against the Kokkos limit for TagStyle
  template<class TagStyle>
  void check_team_size_for(int, int&, int);
  // Team-policy kernel body
  KOKKOS_INLINE_FUNCTION
  void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const;
 private:
  // Per-element parameter views
  Kokkos::View<double*, DeviceType> d_radelem;              // element radii
  Kokkos::View<double*, DeviceType> d_sigmaelem;            // NOTE(review): presumably Gaussian widths - confirm
  Kokkos::View<double*, DeviceType> d_prefacelem;           // prefactor multiplying the Gaussian
  Kokkos::View<double*, DeviceType> d_argfacelem;           // factor multiplying -rsq inside exp()
  Kokkos::View<int*, DeviceType> d_ninside;                // ninside for all atoms in list
  Kokkos::View<int*, DeviceType> d_map;                    // mapping from atom types to elements
  // Squared cutoffs: dual view plus a read-only random-access device view
  using tdual_fparams = Kokkos::DualView<F_FLOAT**, DeviceType>;
  tdual_fparams k_cutsq;
  using t_fparams_rnd = Kokkos::View<const F_FLOAT**, DeviceType,
      Kokkos::MemoryTraits<Kokkos::RandomAccess> >;
  t_fparams_rnd rnd_cutsq;
  int max_neighs;
  int inum;
  int chunk_size;
  int chunk_offset;
  int host_flag;
  int total_range; // total number of loop iterations in grid
  int xlen;
  int ylen;
  int zlen;
  int chunksize;
  int ntotal;
  // Atom coordinates and types
  typename AT::t_x_array_randomread x;
  typename AT::t_int_1d_randomread type;
  // Local output array (dual view and its device view)
  DAT::tdual_float_2d k_alocal;
  typename AT::t_float_2d d_alocal;
  // triclinic vars
  double h0;
  double h1;
  double h2;
  double h3;
  double h4;
  double h5;
  double lo0;
  double lo1;
  double lo2;
 };
 }    // namespace LAMMPS_NS
 #endif
 #endif
--- a/src/KOKKOS/compute_sna_grid_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@ -0,0 +1,25 @@
 // clang-format off
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #include "compute_sna_grid_kokkos.h"
 #include "compute_sna_grid_kokkos_impl.h"
 namespace LAMMPS_NS {
 // Explicit instantiation of the device wrapper; the host wrapper class only
 // exists (and is only instantiated) when LMP_KOKKOS_GPU is defined.
 template class ComputeSNAGridKokkosDevice<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
 template class ComputeSNAGridKokkosHost<LMPHostType>;
 #endif
 }
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@ -0,0 +1,297 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifdef COMPUTE_CLASS
 // clang-format off
 // Register the /kk, /kk/device and /kk/host style names.
 // With LMP_KOKKOS_GPU the host style uses the dedicated Host wrapper class;
 // otherwise it reuses the Device wrapper instantiated with LMPHostType.
 ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice<LMPDeviceType>);
 ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice<LMPDeviceType>);
 #ifdef LMP_KOKKOS_GPU
 ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost<LMPHostType>);
 #else
 ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
 #endif
 // clang-format on
 #else
 // clang-format off
 #ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
 #define LMP_COMPUTE_SNA_GRID_KOKKOS_H
 #include "compute_sna_grid.h"
 #include "kokkos_type.h"
 #include "sna_kokkos.h"
 namespace LAMMPS_NS {
 // Empty tag types used to select among the operator() overloads of
 // ComputeSNAGridKokkos when a kernel is launched.
 // GPU backend only
 struct TagCSNAGridComputeNeigh{};
 struct TagCSNAGridComputeCayleyKlein{};
 struct TagCSNAGridPreUi{};
 struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
 struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
 struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
 // The chemsnap template flag is chosen at launch time (true when nelements > 1)
 template <bool chemsnap> struct TagCSNAGridComputeZi{};
 template <bool chemsnap> struct TagCSNAGridComputeBi{};
 struct TagCSNAGridLocalFill{}; // fill the gridlocal array
 struct TagComputeSNAGridLoop{};
 struct TagComputeSNAGrid3D{};
 // CPU backend only
 struct TagComputeSNAGridLoopCPU{};
 //template<class DeviceType>
 // Kokkos variant of ComputeSNAGrid, templated on the device type, the
 // floating-point type used by the SNA kernels, and the vector length.
 // The class is its own functor: each tagged operator() below is one kernel.
 template<class DeviceType, typename real_type_, int vector_length_>
 class ComputeSNAGridKokkos : public ComputeSNAGrid {
 public:
  typedef DeviceType device_type;
  typedef ArrayTypes<DeviceType> AT;
  static constexpr int vector_length = vector_length_;
  using real_type = real_type_;
  using complex = SNAComplex<real_type>;
  // Static team/tile sizes for device offload
 #ifdef KOKKOS_ENABLE_HIP
  static constexpr int team_size_compute_neigh = 2;
  static constexpr int tile_size_compute_ck = 2;
  static constexpr int tile_size_pre_ui = 2;
  static constexpr int team_size_compute_ui = 2;
  static constexpr int tile_size_transform_ui = 2;
  static constexpr int tile_size_compute_zi = 2;
  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
  static constexpr int tile_size_compute_bi = 2;
  static constexpr int tile_size_compute_yi = 2;
  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
  static constexpr int team_size_compute_fused_deidrj = 2;
 #else
  static constexpr int team_size_compute_neigh = 4;
  static constexpr int tile_size_compute_ck = 4;
  static constexpr int tile_size_pre_ui = 4;
  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
  static constexpr int tile_size_transform_ui = 4;
  static constexpr int tile_size_compute_zi = 8;
  static constexpr int tile_size_compute_bi = 4;
  static constexpr int tile_size_compute_yi = 8;
  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
  // this empirically reduces perf fluctuations from compiler version to compiler version
  static constexpr int min_blocks_compute_zi = 4;
  static constexpr int min_blocks_compute_yi = 4;
 #endif
  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
  // and reduces the verbosity of the LaunchBound by hiding the explicit
  // multiplication by vector_length
  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
  // MDRangePolicy for the 3D grid loop:
  // (the TagComputeSNA parameter is accepted but not forwarded to the policy)
  template <class Device, class TagComputeSNA>
  using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
  // Testing out team policies
  template <class Device, int num_teams,  class TagComputeSNA>
  using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
  //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNA>;
  //using team_member = typename team_policy::member_type;
  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
  // This hides the LaunchBounds abstraction by hiding the explicit
  // multiplication by vector length
  template <class Device, int num_teams, class TagComputeSNA>
  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
  // Helper routine that builds a Snap3DRangePolicy spanning
  // {vector_length, second_loop, chunk_size_div} with tiles of
  // {vector_length, num_tiles, 1}
  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
                                                                 {vector_length, second_loop, chunk_size_div},
                                                                 {vector_length, num_tiles, 1});
  }
  ComputeSNAGridKokkos(class LAMMPS *, int, char **);
  ~ComputeSNAGridKokkos() override;
  void setup() override;
  void compute_array() override;
  // Utility functions for teams
  template<class TagStyle>
  void check_team_size_for(int, int&);
  template<class TagStyle>
  void check_team_size_reduce(int, int&);
  // operator function for example team policy
  //KOKKOS_INLINE_FUNCTION
  KOKKOS_INLINE_FUNCTION
  void operator() (TagComputeSNAGridLoop, const int& ) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagComputeSNAGridLoopCPU, const int&) const;
  // Builds the per-grid-point neighbor data (one grid point per team member)
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
  // 3D case - used by parallel_for
  KOKKOS_INLINE_FUNCTION
  void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
  // In the three-index overloads below, the grid-point index within the
  // current chunk is iatom_mod + iatom_div * vector_length
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridPreUi, const int& iatom) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridTransformUi, const int& iatom) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalFill,const int& ii) const;
 protected:
  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
  // max_neighs: neighbor capacity passed to snaKK.grow_rij
  // chunk_size / chunk_offset: size and starting grid index of the current chunk
  int max_neighs, chunk_size, chunk_offset;
  int host_flag;  // set when the execution space is Host
  int ntotal;     // nlocal + nghost atoms
  int total_range; // total number of loop iterations in grid
  int zlen; //= nzhi-nzlo+1;
  int ylen; //= nyhi-nylo+1;
  int xlen; //= nxhi-nxlo+1;
  double cutsq_tmp; // temporary cutsq until we get a view
  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
  Kokkos::View<real_type*, DeviceType> d_test;              // test view
  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
  tdual_fparams k_cutsq;
  typedef Kokkos::View<const F_FLOAT**, DeviceType,
      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
  t_fparams_rnd rnd_cutsq;
  // Atom coordinates and types, refreshed in compute_array()
  typename AT::t_x_array_randomread x;
  typename AT::t_int_1d_randomread type;
  // Grid arrays: dual views and the matching device views
  DAT::tdual_float_2d k_grid;
  DAT::tdual_float_2d k_gridall;
  typename AT::t_float_2d d_grid;
  typename AT::t_float_2d d_gridall;
  DAT::tdual_float_4d k_gridlocal;
  typename AT::t_float_4d d_gridlocal;
  // Utility routine which wraps computing per-team scratch size requirements for
  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
  template <typename scratch_type>
  int scratch_size_helper(int values_per_team);
  class DomainKokkos *domainKK;
  // triclinic vars
  double h0, h1, h2, h3, h4, h5;
  double lo0, lo1, lo2;
  // Make SNAKokkos a friend
  friend class SNAKokkos<DeviceType, real_type, vector_length>;
 };
 // These wrapper classes exist to make the compute style factory happy/avoid having
 // to extend the compute  style factory to support Compute classes w/an arbitrary number
 // of extra template parameters
 // Single-template-parameter wrapper that fixes the real type and vector
 // length to SNAP_KOKKOS_REAL / SNAP_KOKKOS_DEVICE_VECLEN.
 template <class DeviceType>
 class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
 public:
  ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
  void compute_array() override;
 private:
  // shorthand for the fully specified base class
  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
 };
 #ifdef LMP_KOKKOS_GPU
 // Single-template-parameter wrapper that fixes the real type and vector
 // length to SNAP_KOKKOS_REAL / SNAP_KOKKOS_HOST_VECLEN.
 template <class DeviceType>
 class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
 public:
  ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
  void compute_array() override;
 private:
  // shorthand for the fully specified base class
  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
 };
 #endif
 }
 #endif
 #endif
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@ -0,0 +1,786 @@
 // clang-format off
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
                         Evan Weinberg (NVIDIA)
 ------------------------------------------------------------------------- */
 #include "compute_sna_grid_kokkos.h"
 #include "pair_snap_kokkos.h"
 #include "atom_kokkos.h"
 #include "atom_masks.h"
 #include "comm.h"
 #include "error.h"
 #include "memory_kokkos.h"
 #include "modify.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "neighbor_kokkos.h"
 #include "domain.h"
 #include "domain_kokkos.h"
 #include "sna.h"
 #include "update.h"
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 // Size constants; neither is referenced in the part of this file visible here.
 // NOTE(review): presumably carried over from another source file - confirm
 // they are still needed.
 #define MAXLINE 1024
 #define MAXWORD 3
 namespace LAMMPS_NS {
 // Constructor
 // Builds the Kokkos-side state on top of ComputeSNAGrid: the cutsq dual view,
 // the per-element parameter views (radii, weights, inner-cutoff parameters,
 // type->element map) and the SNAKokkos helper object.
 //   lmp, narg, arg - forwarded unchanged to the ComputeSNAGrid constructor
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
 {
  kokkosable = 1;
  atomKK = (AtomKokkos *) atom;
  domainKK = (DomainKokkos *) domain;
  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
  datamask_read = EMPTY_MASK;
  datamask_modify = EMPTY_MASK;
  k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
  auto d_cutsq = k_cutsq.template view<DeviceType>();
  rnd_cutsq = d_cutsq;
  host_flag = (execution_space == Host);
  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
  cutsq_tmp = cutsq[1][1];
  for (int i = 1; i <= atom->ntypes; i++) {
    for (int j = 1; j <= atom->ntypes; j++){
      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
    }
  }
  // Flag the host copy as modified once, after all entries are written;
  // the device copy is brought up to date by the sync in compute_array().
  k_cutsq.template modify<LMPHostType>();
   // Set up element lists
  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
  // test
  MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
  int n = atom->ntypes;
  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
  auto h_map = Kokkos::create_mirror_view(d_map);
  // test
  auto h_test = Kokkos::create_mirror_view(d_test);
  h_test(0) = 2.0;
  // start from index 1 because of how compute sna/grid is
  // The host arrays are one-based (per type) while the views are read
  // zero-based in the kernels, so every view is filled at index i-1.
  // NOTE(review): the views are sized by nelements but this loop runs to
  // ntypes - confirm nelements >= ntypes holds for all inputs.
  for (int i = 1; i <= atom->ntypes; i++) {
    h_radelem(i-1) = radelem[i];
    h_wjelem(i-1) = wjelem[i];
    if (switchinnerflag){
      // was h_sinnerelem(i)/h_dinnerelem(i): left slot 0 unset and wrote
      // one slot past the last type
      h_sinnerelem(i-1) = sinnerelem[i];
      h_dinnerelem(i-1) = dinnerelem[i];
    }
  }
  // In pair snap some things like `map` get allocated regardless of chem flag.
  if (chemflag){
    for (int i = 1; i <= atom->ntypes; i++) {
      h_map(i) = map[i];
    }
  }
  Kokkos::deep_copy(d_radelem,h_radelem);
  Kokkos::deep_copy(d_wjelem,h_wjelem);
  if (switchinnerflag){
    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
  }
  if (chemflag){
    Kokkos::deep_copy(d_map,h_map);
  }
  Kokkos::deep_copy(d_test,h_test);
  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
  snaKK.grow_rij(0,0);
  snaKK.init();
 }
 // Destructor
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
 {
  // Nothing is released while copymode is set.
  if (!copymode) {
    memoryKK->destroy_kokkos(k_cutsq,cutsq);
    memoryKK->destroy_kokkos(k_gridall, gridall);
  }
 }
 // Setup
 template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
 {
  // ComputeGrid::setup() is deliberately bypassed because it would allocate
  // the grid array; only the index-setting helpers are invoked here.
  ComputeGrid::set_grid_global();
  ComputeGrid::set_grid_local();
  // Allocate the global grid array as a dual view, take its device view,
  // and expose the host pointer through `array`.
  memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
  d_gridall = k_gridall.template view<DeviceType>();
  array = gridall;
  // gridlocal is neither used nor allocated for now
  gridlocal_allocated = 0;
  d_gridlocal = k_gridlocal.template view<DeviceType>();
 }
 // Compute
 // Evaluates the bispectrum on the local grid. On the host execution space
 // the work is delegated to ComputeSNAGrid::compute_array(); otherwise the
 // grid points are processed in chunks, each chunk running the sequence
 // ComputeNeigh -> ComputeCayleyKlein -> PreUi -> ComputeUi -> TransformUi ->
 // ComputeZi -> ComputeBi -> LocalFill, and the result views are then synced
 // back to the host.
 template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
 {
  if (host_flag) {
    ComputeSNAGrid::compute_array();
    return;
  }
  copymode = 1;
  zlen = nzhi-nzlo+1;
  ylen = nyhi-nylo+1;
  xlen = nxhi-nxlo+1;
  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
  x = atomKK->k_x.view<DeviceType>();
  type = atomKK->k_type.view<DeviceType>();
  k_cutsq.template sync<DeviceType>();
  // max_neighs is defined here - think of more elaborate methods.
  // NOTE(review): this fixed value is also the neighbor capacity handed to
  // snaKK.grow_rij below - confirm it cannot be exceeded for the cutoffs in use.
  max_neighs = 100;
  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
  // number of atoms.
  ntotal = atomKK->nlocal + atomKK->nghost;
  // Allocate view for number of neighbors per grid point
  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
  // `total_range` is the number of grid points which may be larger than chunk size.
  chunk_size = MIN(chunksize, total_range);
  chunk_offset = 0;
  snaKK.grow_rij(chunk_size, max_neighs);
  if (triclinic) {
    h0 = domain->h[0];
    h1 = domain->h[1];
    h2 = domain->h[2];
    h3 = domain->h[3];
    h4 = domain->h[4];
    h5 = domain->h[5];
    lo0 = domain->boxlo[0];
    lo1 = domain->boxlo[1];
    lo2 = domain->boxlo[2];
  }
  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
    if (chunk_size > total_range - chunk_offset)
      chunk_size = total_range - chunk_offset;
    // Pre-compute ceil(chunk_size / vector_length) for code cleanliness.
    // Done per chunk, after the clamp above, so the last (smaller) chunk does
    // not launch tiles for grid points beyond chunk_size.
    const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
    //ComputeNeigh
    {
      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
    }
    //ComputeCayleyKlein
    {
      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
    }
    //PreUi
    {
      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>(chunk_size_div, twojmax + 1);
      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
    }
    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
    {
      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
      const int tile_size = vector_length * (twojmax + 1);
      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
      if (chunk_size < parallel_thresh)
      {
        // Version with parallelism over j_bend
        // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
      } else {
        // Version w/out parallelism over j_bend
        // total number of teams needed: (natoms / 32) * (ntotal)
        const int n_teams = chunk_size_div * max_neighs;
        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
      }
    }
    //TransformUi: un-"fold" ulisttot, zero ylist
    {
      // Expand ulisttot_re,_im -> ulisttot
      // Zero out ylist
      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>(chunk_size_div, snaKK.idxu_max);
      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
    }
    //Compute bispectrum
    // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h`
    //ComputeZi and Bi
    if (nelements > 1) {
      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
    } else {
      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
    }
    // Fill the grid array with bispectrum values
    {
      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
      Kokkos::parallel_for(policy_fill, *this);
    }
    // Proceed to the next chunk.
    chunk_offset += chunk_size;
  } // end while
  copymode = 0;
  k_gridlocal.template modify<DeviceType>();
  k_gridlocal.template sync<LMPHostType>();
  k_gridall.template modify<DeviceType>();
  k_gridall.template sync<LMPHostType>();
 }
 /* ----------------------------------------------------------------------
   Begin routines that are unique to the GPU codepath. These take advantage
   of AoSoA data layouts and scratch memory for recursive polynomials
 ------------------------------------------------------------------------- */
 /*
 Simple team policy functor seeing how many layers deep we can go with the parallelism.
 */
 // ComputeNeigh kernel: for the grid point handled by this team member,
 // compute its position, scan all ntotal atoms, and store every atom inside
 // the cutoff (and not coincident with the grid point) in the snaKK neighbor
 // arrays. The number of stored neighbors is written to d_ninside(ii).
 //   team - Kokkos team handle; the flat index derived from it selects the
 //          grid point within the current chunk
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
  // Main difference is that we don't use the neighbor class or neighbor variables here.
  // This is because the grid points are not atoms and therefore do not get assigned
  // neighbors in LAMMPS.
  // TODO: If we did make a neighborlist for each grid point, we could use current
  //       routines and avoid having to loop over all atoms (which limits us to
  //       natoms = max team size).
  // basic quantities associated with this team:
  // team_rank : rank of thread in this team
  // league_rank : rank of team in this league
  // team_size : number of threads in this team
  // extract loop index
  int ii = team.team_rank() + team.league_rank() * team.team_size();
  if (ii >= chunk_size) return;
  // extract grid index
  int igrid = ii + chunk_offset;
  // convert to grid indices
  int iz = igrid/(xlen*ylen);
  int i2 = igrid - (iz*xlen*ylen);
  int iy = i2/xlen;
  int ix = i2 % xlen;
  iz += nzlo;
  iy += nylo;
  ix += nxlo;
  // multiply grid integers by grid spacing delx, dely, delz
  double xgrid[3];
  xgrid[0] = ix * delx;
  xgrid[1] = iy * dely;
  xgrid[2] = iz * delz;
  if (triclinic) {
    // Do a conversion on `xgrid` here like we do in the CPU version.
    // domainKK->lamda2x(xgrid, xgrid) cannot be used because calling a
    // __host__ function from a __host__ __device__ operator() is not allowed,
    // so the transform is applied with the h*/lo* values cached by compute_array().
    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
    xgrid[2] = h2*xgrid[2] + lo2;
  }
  const F_FLOAT xtmp = xgrid[0];
  const F_FLOAT ytmp = xgrid[1];
  const F_FLOAT ztmp = xgrid[2];
  // currently, all grid points are type 1
  // not clear what a better choice would be
  const int itype = 1;
  int ielem = 0;
  if (chemflag) ielem = d_map[itype];
  // Single pass over all atoms: the neighbor slot index doubles as the
  // neighbor count, so the count always matches the number of filled slots.
  // NOTE(review): offset is not checked against the max_neighs capacity that
  // compute_array() passes to snaKK.grow_rij - confirm it cannot be exceeded.
  // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
  int offset = 0;
  for (int j = 0; j < ntotal; j++){
    const F_FLOAT dx = x(j,0) - xtmp;
    const F_FLOAT dy = x(j,1) - ytmp;
    const F_FLOAT dz = x(j,2) - ztmp;
    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
    const int jtype = type(j);
    // don't include atoms that share location with grid point
    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
      // element index is 0 unless chemflag is set
      const int jelem = chemflag ? d_map[jtype] : 0;
      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
      // actually since the views here have values starting at 0, let's use jelem
      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
      snaKK.inside(ii,offset) = j;
      if (switchinnerflag) {
        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
      }
      snaKK.element(ii,offset) = jelem;
      offset++;
    }
  }
  d_ninside(ii) = offset;
 }
 /* ----------------------------------------------------------------------
  Cayley-Klein kernel: for one (chunk-local index, neighbor slot) pair,
  ask snaKK to pre-compute the parameters reused by later routines
 ------------------------------------------------------------------------- */
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
  // rebuild the flat chunk-local index from its (lane, tile) decomposition
  const int idx = iatom_div * vector_length + iatom_mod;
  // only act on indices inside the chunk and on neighbor slots that were
  // actually filled (d_ninside holds the per-index neighbor count)
  if (idx < chunk_size && jnbor < d_ninside(idx))
    snaKK.compute_cayley_klein(idx, jnbor);
 }
 /* ----------------------------------------------------------------------
  Initialize the "ulisttot" structure with non-zero on-diagonal terms
  and zero terms elsewhere (3-index launch: lane, j, tile)
 ------------------------------------------------------------------------- */
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
  // flat chunk-local index of the grid point handled by this lane
  const int iatom = iatom_mod + iatom_div * vector_length;
  if (iatom >= chunk_size) return;
  // iatom enumerates grid points of the current chunk, not atoms, so it must
  // not be used to index the atom type view. Grid points are all treated as
  // type 1, the same convention the neighbor kernel in this file uses.
  const int itype = 1;
  int ielem = 0;
  if (chemflag) ielem = d_map[itype];
  snaKK.pre_ui(iatom, j, ielem);
 }
 // PreUi, 2-index launch: (chunk-local grid index, j)
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const {
  if (iatom >= chunk_size) return;
  // iatom enumerates grid points of the current chunk, not atoms, so it must
  // not be used to index the atom type view. Grid points are all treated as
  // type 1, the same convention the neighbor kernel in this file uses.
  const int itype = 1;
  int ielem = 0;
  if (chemflag) ielem = d_map[itype];
  snaKK.pre_ui(iatom, j, ielem);
 }
 // PreUi, 1-index launch: one call handles every j in [0, twojmax] for a grid point
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom) const {
  if (iatom >= chunk_size) return;
  // iatom enumerates grid points of the current chunk, not atoms, so it must
  // not be used to index the atom type view. Grid points are all treated as
  // type 1, the same convention the neighbor kernel in this file uses.
  const int itype = 1;
  int ielem = 0;
  if (chemflag) ielem = d_map[itype];
  for (int j = 0; j <= twojmax; j++)
    snaKK.pre_ui(iatom, j, ielem);
 }
 // ComputeUi, "small" variant: each team thread is assigned one
 // (tile, jbend, neighbor) triple and its vector lanes cover the tile entries
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
  // work items per tile: one for every (jbend, neighbor) combination
  const int per_tile = max_neighs * (twojmax + 1);
  // position of this thread in the flattened tile / jbend / neighbor space
  const int flat = team.league_rank() * team_size_compute_ui + team.team_rank();
  // peel off the tile, then split the remainder into jbend and neighbor
  int iatom_div = flat / per_tile; // kept non-const to work around GCC 7 bug
  const int within_tile = flat % per_tile;
  const int jbend = within_tile / max_neighs;
  int jj = within_tile % max_neighs; // kept non-const to work around GCC 7 bug
  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
    [&] (const int iatom_mod) {
    const int idx = vector_length * iatom_div + iatom_mod;
    // skip lanes past the chunk end and neighbor slots that were never filled
    if (idx < chunk_size && jj < d_ninside(idx))
      snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
  });
 }
 // ComputeUi, "large" variant: each team thread is assigned one
 // (tile, neighbor) pair and its vector lanes cover the tile entries
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
  // position of this thread in the flattened tile / neighbor space
  const int flat = team.league_rank() * team_size_compute_ui + team.team_rank();
  // split into tile number and neighbor slot
  int iatom_div = flat / max_neighs; // kept non-const to work around GCC 7 bug
  int jj = flat % max_neighs;
  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
    [&] (const int iatom_mod) {
    const int idx = vector_length * iatom_div + iatom_mod;
    // skip lanes past the chunk end and neighbor slots that were never filled
    if (idx < chunk_size && jj < d_ninside(idx))
      snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
  });
 }
 /* ----------------------------------------------------------------------
  TransformUi kernels: forward one (chunk-local index, idxu) pair to
  snaKK.transform_ui, which de-symmetrizes ulisttot_re/_im into the
  unified ulisttot structure and zero-initializes ylist. CPU and GPU.
  This overload is the 3-index launch (lane, idxu, tile).
 ------------------------------------------------------------------------- */
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
  const int idx = iatom_div * vector_length + iatom_mod;
  // ignore work items that fall outside the chunk or the idxu range
  if (idx < chunk_size && idxu < snaKK.idxu_max)
    snaKK.transform_ui(idx, idxu);
 }
 // TransformUi, 2-index launch: (chunk-local index, idxu); idxu is not range-checked here
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const {
  if (iatom < chunk_size)
    snaKK.transform_ui(iatom, idxu);
 }
 // TransformUi, 1-index launch: one call sweeps every idxu for a chunk-local index
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom) const {
  if (iatom < chunk_size) {
    const auto n_idxu = snaKK.idxu_max;
    for (int idxu = 0; idxu < n_idxu; ++idxu)
      snaKK.transform_ui(iatom, idxu);
  }
 }
 /* ----------------------------------------------------------------------
  ComputeZi kernels: forward one (chunk-local index, jjz) pair to
  snaKK.compute_zi, which computes the elements of the Z tensor and
  stores them into the `zlist` view.
  This overload is the 3-index launch (lane, jjz, tile).
 ------------------------------------------------------------------------- */
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
  const int idx = iatom_div * vector_length + iatom_mod;
  // ignore work items that fall outside the chunk or the idxz range
  if (idx < chunk_size && jjz < snaKK.idxz_max)
    snaKK.template compute_zi<chemsnap>(idx, jjz);
 }
 // ComputeZi, 2-index launch: (chunk-local index, jjz); jjz is not range-checked here
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
  if (iatom < chunk_size)
    snaKK.template compute_zi<chemsnap>(iatom, jjz);
 }
 // ComputeZi, 1-index launch: one call sweeps every jjz for a chunk-local index
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const {
  if (iatom < chunk_size) {
    const auto n_idxz = snaKK.idxz_max;
    for (int jjz = 0; jjz < n_idxz; ++jjz)
      snaKK.template compute_zi<chemsnap>(iatom, jjz);
  }
 }
 /* ----------------------------------------------------------------------
  ComputeBi kernels: forward one (chunk-local index, jjb) pair to
  snaKK.compute_bi, which computes the energy triple products and
  stores them in the "blist" view.
  This overload is the 3-index launch (lane, jjb, tile).
 ------------------------------------------------------------------------- */
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
  const int idx = iatom_div * vector_length + iatom_mod;
  // ignore work items that fall outside the chunk or the idxb range
  if (idx < chunk_size && jjb < snaKK.idxb_max)
    snaKK.template compute_bi<chemsnap>(idx, jjb);
 }
 // ComputeBi, 2-index launch: (chunk-local index, jjb); jjb is not range-checked here
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
  if (iatom < chunk_size)
    snaKK.template compute_bi<chemsnap>(iatom, jjb);
 }
 // ComputeBi, 1-index launch: one call sweeps every jjb for a chunk-local index
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const {
  if (iatom < chunk_size) {
    const auto n_idxb = snaKK.idxb_max;
    for (int jjb = 0; jjb < n_idxb; ++jjb)
      snaKK.template compute_bi<chemsnap>(iatom, jjb);
  }
 }
 // LocalFill kernel: write one row of d_gridall for chunk-local grid point ii.
 // Columns 0-2 receive the grid-point coordinates, columns 3.. receive the
 // ncoeff bispectrum components read from snaKK.blist.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
  // row of d_gridall that belongs to this grid point
  const int row = chunk_offset + ii;
  // unflatten the row number into sub-grid indices (x varies fastest),
  // then shift by the lower bounds nxlo/nylo/nzlo
  const int plane = xlen*ylen;
  const int in_plane = row % plane;
  const int ix = nxlo + in_plane % xlen;
  const int iy = nylo + in_plane / xlen;
  const int iz = nzlo + row / plane;
  // scale the integer indices by the grid spacing
  double pos[3];
  pos[0] = ix * delx;
  pos[1] = iy * dely;
  pos[2] = iz * delz;
  if (triclinic) {
    // Same conversion the CPU version performs through lamda2x. Per the
    // original author's note that host routine cannot be called from this
    // kernel, so it is written out with the cached h0..h5 / lo0..lo2 values.
    // Each line only reads components that have not been overwritten yet.
    pos[0] = h0*pos[0] + h5*pos[1] + h4*pos[2] + lo0;
    pos[1] = h1*pos[1] + h3*pos[2] + lo1;
    pos[2] = h2*pos[2] + lo2;
  }
  // coordinates go into the first three columns
  for (int k = 0; k < 3; ++k) {
    const F_FLOAT coord = pos[k];
    d_gridall(row,k) = coord;
  }
  // linear contributions: icoeff is split into (chemical index, b index)
  const auto n_idxb = snaKK.idxb_max;
  for (int icoeff = 0; icoeff < ncoeff; ++icoeff)
    d_gridall(row,3+icoeff) = snaKK.blist(ii, icoeff / n_idxb, icoeff % n_idxb);
 }
 /* ----------------------------------------------------------------------
   utility functions
 ------------------------------------------------------------------------- */
 // Clamp team_size so that team_size*vector_length does not exceed the
 // maximum team size Kokkos reports for a parallel_for over TagStyle.
 // inum is the league size of the probe policy; team_size is updated in place.
 template<class DeviceType, typename real_type, int vector_length>
 template<class TagStyle>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
  Kokkos::TeamPolicy<DeviceType,TagStyle> probe(inum,Kokkos::AUTO);
  const int limit = probe.team_size_max(*this,Kokkos::ParallelForTag());
  if (team_size*vector_length > limit) {
    team_size = limit/vector_length;
  }
 }
 // Clamp team_size so that team_size*vector_length does not exceed the
 // maximum team size Kokkos reports for a parallel_reduce over TagStyle.
 // inum is the league size of the probe policy; team_size is updated in place.
 template<class DeviceType, typename real_type, int vector_length>
 template<class TagStyle>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
  Kokkos::TeamPolicy<DeviceType,TagStyle> probe(inum,Kokkos::AUTO);
  const int limit = probe.team_size_max(*this,Kokkos::ParallelReduceTag());
  if (team_size*vector_length > limit) {
    team_size = limit/vector_length;
  }
 }
 // Return the shmem_size() of an unmanaged 1-d scratch view holding
 // values_per_team entries of scratch_type
 template<class DeviceType, typename real_type, int vector_length>
 template<typename scratch_type>
 int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
  using scratch_view_t = Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
  return scratch_view_t::shmem_size(values_per_team);
 }
 /* ---------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   routines used by template reference classes
 ------------------------------------------------------------------------- */
 // Device wrapper constructor: forwards all arguments to the parent template
 // instantiated with SNAP_KOKKOS_REAL and SNAP_KOKKOS_DEVICE_VECLEN
 template<class DeviceType>
 ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
   : Base(lmp, narg, arg) {}
 // Delegates directly to the parent template's compute_array()
 template<class DeviceType>
 void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
 {
  this->Base::compute_array();
 }
 #ifdef LMP_KOKKOS_GPU
 // Host wrapper constructor: forwards all arguments to the parent template
 // instantiated with SNAP_KOKKOS_REAL and SNAP_KOKKOS_HOST_VECLEN
 template<class DeviceType>
 ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
   : Base(lmp, narg, arg) {}
 // Delegates directly to the parent template's compute_array()
 template<class DeviceType>
 void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
 {
  this->Base::compute_array();
 }
 #endif
 }
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
@ -0,0 +1,25 @@
 // clang-format off
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #include "compute_sna_grid_local_kokkos.h"
 #include "compute_sna_grid_local_kokkos_impl.h"
 namespace LAMMPS_NS {
 // Explicit instantiation of the device wrapper for LMPDeviceType, so the
 // templates from the _impl.h header are compiled in this translation unit.
 template class ComputeSNAGridLocalKokkosDevice<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
 // With LMP_KOKKOS_GPU defined the separate host wrapper is instantiated too.
 template class ComputeSNAGridLocalKokkosHost<LMPHostType>;
 #endif
 }
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@ -0,0 +1,288 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifdef COMPUTE_CLASS
 // clang-format off
 // Register the style under its plain /kk name and the explicit /kk/device name;
 // both map to the device wrapper instantiated with LMPDeviceType.
 ComputeStyle(sna/grid/local/kk,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
 ComputeStyle(sna/grid/local/kk/device,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
 #ifdef LMP_KOKKOS_GPU
 // LMP_KOKKOS_GPU defined: /kk/host uses the dedicated host wrapper class.
 ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosHost<LMPHostType>);
 #else
 // Otherwise the host wrapper class is not declared, so /kk/host reuses the
 // device wrapper instantiated with LMPHostType.
 ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice<LMPHostType>);
 #endif
 // clang-format on
 #else
 // clang-format off
 #ifndef LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
 #define LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
 #include "compute_sna_grid_local.h"
 #include "kokkos_type.h"
 #include "sna_kokkos.h"
 namespace LAMMPS_NS {
 // Empty tag types. Each one selects the matching operator() overload of
 // ComputeSNAGridLocalKokkos when passed as the first argument / policy tag.
 // NOTE(review): the original grouping comments ("both the CPU and GPU backend"
 // immediately followed by "GPU backend only") are ambiguous - confirm which
 // tags each backend actually launches.
 // GPU backend only
 struct TagCSNAGridLocalComputeNeigh{};
 struct TagCSNAGridLocalComputeCayleyKlein{};
 struct TagCSNAGridLocalPreUi{};
 struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence
 struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence
 struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
 template <bool chemsnap> struct TagCSNAGridLocalComputeZi{};
 template <bool chemsnap> struct TagCSNAGridLocalComputeBi{};
 struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array
 struct TagComputeSNAGridLocalLoop{};
 struct TagComputeSNAGridLocal3D{};
 // CPU backend only
 struct TagComputeSNAGridLocalLoopCPU{};
 //template<class DeviceType>
 // Kokkos variant of ComputeSNAGridLocal.
 // Template parameters:
 //   DeviceType     - Kokkos device the views and policies are declared for
 //   real_type_     - floating-point type used for the SNA views (exposed as real_type)
 //   vector_length_ - compile-time vector length (exposed as vector_length); it
 //                    multiplies the tile/team sizes in the launch bounds below
 // The tagged operator() overloads are the kernel bodies; the tag argument
 // selects which one a given launch policy invokes.
 template<class DeviceType, typename real_type_, int vector_length_>
 class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
 public:
  typedef DeviceType device_type;
  typedef ArrayTypes<DeviceType> AT;
  static constexpr int vector_length = vector_length_;
  using real_type = real_type_;
  using complex = SNAComplex<real_type>;
  // Static team/tile sizes for device offload
  // (one set of values for HIP builds, another for every other build)
 #ifdef KOKKOS_ENABLE_HIP
  static constexpr int team_size_compute_neigh = 2;
  static constexpr int tile_size_compute_ck = 2;
  static constexpr int tile_size_pre_ui = 2;
  static constexpr int team_size_compute_ui = 2;
  static constexpr int tile_size_transform_ui = 2;
  static constexpr int tile_size_compute_zi = 2;
  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
  static constexpr int tile_size_compute_bi = 2;
  static constexpr int tile_size_compute_yi = 2;
  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
  static constexpr int team_size_compute_fused_deidrj = 2;
 #else
  static constexpr int team_size_compute_neigh = 4;
  static constexpr int tile_size_compute_ck = 4;
  static constexpr int tile_size_pre_ui = 4;
  // larger team when real_type is 4 bytes wide
  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
  static constexpr int tile_size_transform_ui = 4;
  static constexpr int tile_size_compute_zi = 8;
  static constexpr int tile_size_compute_bi = 4;
  static constexpr int tile_size_compute_yi = 8;
  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
  // this empirically reduces perf fluctuations from compiler version to compiler version
  static constexpr int min_blocks_compute_zi = 4;
  static constexpr int min_blocks_compute_yi = 4;
 #endif
  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
  // and reduces the verbosity of the LaunchBound by hiding the explicit
  // multiplication by vector_length
  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
  // MDRangePolicy for the 3D grid loop:
  // NOTE(review): TagComputeSNA is accepted but never forwarded to the
  // MDRangePolicy, so a launch through this alias would not select a tagged
  // operator() - confirm whether that is intended before using it.
  template <class Device, class TagComputeSNA>
  using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
  // Testing out team policies
  // NOTE(review): identical in definition to SnapAoSoATeamPolicy below.
  template <class Device, int num_teams,  class TagComputeSNA>
  using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
  // This hides the LaunchBounds abstraction by hiding the explicit
  // multiplication by vector length
  template <class Device, int num_teams, class TagComputeSNA>
  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
  // Helper routine that returns a CPU or a GPU policy as appropriate
  // The returned 3D range is vector_length x second_loop x chunk_size_div,
  // tiled as vector_length x num_tiles x 1.
  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
                                                                 {vector_length, second_loop, chunk_size_div},
                                                                 {vector_length, num_tiles, 1});
  }
  ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **);
  ~ComputeSNAGridLocalKokkos() override;
  void setup() override;
  void compute_local() override;
  // Utility functions for teams
  // (each takes a count and a team size passed by reference)
  template<class TagStyle>
  void check_team_size_for(int, int&);
  template<class TagStyle>
  void check_team_size_reduce(int, int&);
  // Kernel bodies, one overload per dispatch tag (and per launch rank)
  KOKKOS_INLINE_FUNCTION
  void operator() (TagComputeSNAGridLocalLoop, const int& ) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagComputeSNAGridLocalLoopCPU, const int&) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeNeigh>::member_type& team) const;
  // 3D case - used by parallel_for
  KOKKOS_INLINE_FUNCTION
  void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
  // PreUi: 3-, 2- and 1-index launch forms
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalPreUi, const int& iatom) const;
  // ComputeUi: team-based "small" and "large" variants
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiSmall>::member_type& team) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiLarge>::member_type& team) const;
  // TransformUi: 3-, 2- and 1-index launch forms
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalTransformUi, const int& iatom) const;
  // ComputeZi: 3-, 2- and 1-index launch forms, templated on chemsnap
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const;
  // ComputeBi: 3-, 2- and 1-index launch forms, templated on chemsnap
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const;
  KOKKOS_INLINE_FUNCTION
  void operator() (TagCSNAGridLocal2Fill,const int& ii) const;
 protected:
  // SNA helper object the kernels operate on
  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
  int max_neighs, chunk_size, chunk_offset;
  int host_flag;
  int ntotal;
  int total_range; // total number of loop iterations in grid
  int zlen; //= nzhi-nzlo+1;
  int ylen; //= nyhi-nylo+1;
  int xlen; //= nxhi-nxlo+1;
  double cutsq_tmp; // temporary cutsq until we get a view
  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
  Kokkos::View<real_type*, DeviceType> d_test;              // test view
  // cutoff-squared table: dual view plus a const random-access device view of it
  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
  tdual_fparams k_cutsq;
  typedef Kokkos::View<const F_FLOAT**, DeviceType,
      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
  t_fparams_rnd rnd_cutsq;
  // read-only views of atom coordinates and types
  typename AT::t_x_array_randomread x;
  typename AT::t_int_1d_randomread type;
  // dual view and device view for the local output array
  DAT::tdual_float_2d k_alocal;
  typename AT::t_float_2d d_alocal;
  // Utility routine which wraps computing per-team scratch size requirements for
  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
  template <typename scratch_type>
  int scratch_size_helper(int values_per_team);
  class DomainKokkos *domainKK;
  // triclinic vars
  double h0, h1, h2, h3, h4, h5;
  double lo0, lo1, lo2;
  // Make SNAKokkos a friend
  friend class SNAKokkos<DeviceType, real_type, vector_length>;
 };
 // These wrapper classes exist to make the compute style factory happy/avoid having
 // to extend the compute  style factory to support Compute classes w/an arbitrary number
 // of extra template parameters
 // Single-parameter wrapper that pins real_type to SNAP_KOKKOS_REAL and the
 // vector length to SNAP_KOKKOS_DEVICE_VECLEN
 template <class DeviceType>
 class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
 public:
  ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **);
  void compute_local() override;
 private:
  // shorthand for the fully specified parent class
  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
 };
 #ifdef LMP_KOKKOS_GPU
 // Single-parameter wrapper that pins real_type to SNAP_KOKKOS_REAL and the
 // vector length to SNAP_KOKKOS_HOST_VECLEN
 template <class DeviceType>
 class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
 public:
  ComputeSNAGridLocalKokkosHost(class LAMMPS *, int, char **);
  void compute_local() override;
 private:
  // shorthand for the fully specified parent class
  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
 };
 #endif
 }
 #endif
 #endif
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@ -0,0 +1,783 @@
 // clang-format off
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing authors: Andrew Rohskopf (SNL)
 ------------------------------------------------------------------------- */
 #include "compute_sna_grid_local_kokkos.h"
 #include "pair_snap_kokkos.h"
 #include "atom_kokkos.h"
 #include "atom_masks.h"
 #include "comm.h"
 #include "error.h"
 #include "memory_kokkos.h"
 #include "modify.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "neighbor_kokkos.h"
 #include "domain.h"
 #include "domain_kokkos.h"
 #include "sna.h"
 #include "update.h"
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #define MAXLINE 1024
 #define MAXWORD 3
 namespace LAMMPS_NS {
 // Constructor
 //
 // Forwards the arguments to ComputeSNAGridLocal, then copies the per-type
 // parameters held by the parent (cutsq, radelem, wjelem, sinnerelem,
 // dinnerelem, map) into Kokkos views and builds the SNAKokkos helper.
 //
 //   lmp        LAMMPS instance
 //   narg, arg  compute command arguments, passed through unchanged
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocal(lmp, narg, arg)
 {
  kokkosable = 1;
  atomKK = (AtomKokkos *) atom;
  domainKK = (DomainKokkos *) domain;
  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
  datamask_read = EMPTY_MASK;
  datamask_modify = EMPTY_MASK;

  k_cutsq = tdual_fparams("ComputeSNAGridLocalKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
  auto d_cutsq = k_cutsq.template view<DeviceType>();
  rnd_cutsq = d_cutsq;

  host_flag = (execution_space == Host);

  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
  // Every type pair currently gets the (1,1) cutoff of the parent compute.
  cutsq_tmp = cutsq[1][1];
  for (int i = 1; i <= atom->ntypes; i++) {
    for (int j = 1; j <= atom->ntypes; j++){
      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
    }
  }
  // Flag the host copy as modified once, after all entries are written.
  k_cutsq.template modify<LMPHostType>();

  // Set up element lists
  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridLocalKokkos::radelem",nelements);
  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridLocalKokkos:wjelem",nelements);
  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridLocalKokkos:sinnerelem",nelements);
  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridLocalKokkos:dinnerelem",nelements);
  // test
  MemKK::realloc_kokkos(d_test, "ComputeSNAGridLocalKokkos::test", nelements);

  int n = atom->ntypes;
  MemKK::realloc_kokkos(d_map,"ComputeSNAGridLocalKokkos::map",n+1);

  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
  auto h_map = Kokkos::create_mirror_view(d_map);
  // test
  auto h_test = Kokkos::create_mirror_view(d_test);
  h_test(0) = 2.0;

  // The parent arrays are indexed by atom type starting at 1, while the views
  // are read on the device with 0-based element indices, hence the i-1 shift.
  // The inner-switching parameters use the same shift as radelem/wjelem so
  // that d_sinnerelem[jelem]/d_dinnerelem[jelem] pick up the matching entry
  // and the last type does not write past the end of the view.
  // NOTE(review): the views hold `nelements` entries but this loop runs over
  // atom->ntypes; confirm the two always agree for this compute.
  for (int i = 1; i <= atom->ntypes; i++) {
    h_radelem(i-1) = radelem[i];
    h_wjelem(i-1) = wjelem[i];
    if (switchinnerflag){
      h_sinnerelem(i-1) = sinnerelem[i];
      h_dinnerelem(i-1) = dinnerelem[i];
    }
  }

  // In pair snap some things like `map` get allocated regardless of chem flag.
  if (chemflag){
    for (int i = 1; i <= atom->ntypes; i++) {
      h_map(i) = map[i];
    }
  }

  Kokkos::deep_copy(d_radelem,h_radelem);
  Kokkos::deep_copy(d_wjelem,h_wjelem);
  if (switchinnerflag){
    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
  }
  if (chemflag){
    Kokkos::deep_copy(d_map,h_map);
  }
  Kokkos::deep_copy(d_test,h_test);

  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
  snaKK.grow_rij(0,0);
  snaKK.init();
 }
 // Destructor
 // Releases k_cutsq/cutsq and k_alocal/alocal, but only when copymode is not
 // set (compute_local() raises copymode while it launches kernels with *this).
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridLocalKokkos()
 {
  if (!copymode) {
    memoryKK->destroy_kokkos(k_cutsq,cutsq);
    memoryKK->destroy_kokkos(k_alocal,alocal);
  }
 }
 // Setup
 // Runs the parent grid setup, then allocates the local output array as a
 // dual view of size_local_rows x size_local_cols.
 template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::setup()
 {
  ComputeGridLocal::setup();

  // k_alocal owns the storage; alocal is the matching host pointer array
  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");

  // device-side handle used by the kernels
  d_alocal = k_alocal.template view<DeviceType>();

  // expose the host array as this compute's local array
  array_local = alocal;
 }
 // Compute
 //
 // Evaluates the bispectrum components for every local grid point and stores
 // them in the local array. The grid points are processed in chunks of at most
 // `chunksize` points; for each chunk the kernels run in this order:
 // ComputeNeigh, ComputeCayleyKlein, PreUi, ComputeUi, TransformUi,
 // ComputeZi, ComputeBi, and finally the fill kernel that copies the result
 // into d_alocal.
 template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_local()
 {
  if (host_flag) {
    // This method overrides compute_local(), so the host fallback must
    // delegate to the parent's compute_local() rather than compute_array().
    ComputeSNAGridLocal::compute_local();
    return;
  }

  copymode = 1;

  zlen = nzhi-nzlo+1;
  ylen = nyhi-nylo+1;
  xlen = nxhi-nxlo+1;
  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);

  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
  x = atomKK->k_x.view<DeviceType>();
  type = atomKK->k_type.view<DeviceType>();
  k_cutsq.template sync<DeviceType>();

  // max_neighs is defined here - think of more elaborate methods.
  // NOTE(review): the neighbor storage below is sized with this fixed value,
  // but ComputeNeigh does not limit the number of neighbors it stores per grid
  // point; confirm that more than max_neighs atoms within the cutoff cannot occur.
  max_neighs = 100;

  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
  // number of atoms.
  ntotal = atomKK->nlocal + atomKK->nghost;

  // Allocate view for number of neighbors per grid point
  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range);

  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
  // `total_range` is the number of grid points which may be larger than chunk size.
  chunk_size = MIN(chunksize, total_range);
  chunk_offset = 0;
  //snaKK.grow_rij(chunk_size, ntotal);
  snaKK.grow_rij(chunk_size, max_neighs);

  //chunk_size = total_range;

  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
  // It is derived from the initial chunk size; a shorter final chunk is
  // handled by the `>= chunk_size` guards inside the kernels.
  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;

  // Cache the box matrix and lower corner in members so the kernels can read
  // them without dereferencing `domain`.
  if (triclinic) {
    h0 = domain->h[0];
    h1 = domain->h[1];
    h2 = domain->h[2];
    h3 = domain->h[3];
    h4 = domain->h[4];
    h5 = domain->h[5];
    lo0 = domain->boxlo[0];
    lo1 = domain->boxlo[1];
    lo2 = domain->boxlo[2];
  }

  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory

    if (chunk_size > total_range - chunk_offset)
      chunk_size = total_range - chunk_offset;

    //ComputeNeigh
    {
      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);

      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridLocalComputeNeigh>
        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
    }

    //ComputeCayleyKlein
    {
      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridLocalComputeCayleyKlein>
        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
    }

    //PreUi
    {
      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridLocalPreUi>(chunk_size_div, twojmax + 1);
      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
    }

    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
    {
      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
      const int tile_size = vector_length * (twojmax + 1);
      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);

      if (chunk_size < parallel_thresh)
      {
        // Version with parallelism over j_bend
        // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;

        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiSmall>
          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
      } else {
        // Version w/out parallelism over j_bend
        // total number of teams needed: (natoms / 32) * (ntotal)
        const int n_teams = chunk_size_div * max_neighs;
        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;

        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiLarge>
          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
      }
    }

    //TransformUi: un-"fold" ulisttot, zero ylist
    {
      // Expand ulisttot_re,_im -> ulisttot
      // Zero out ylist
      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridLocalTransformUi>(chunk_size_div, snaKK.idxu_max);
      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
    }

    //Compute bispectrum
    // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h`

    //ComputeZi and Bi
    if (nelements > 1) {
      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);

      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
    } else {
      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);

      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
    }

    // Fill the grid array with bispectrum values
    {
      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocal2Fill> policy_fill(0,chunk_size);
      Kokkos::parallel_for(policy_fill, *this);
    }

    // Proceed to the next chunk.
    chunk_offset += chunk_size;
  } // end while

  copymode = 0;

  // The kernels wrote the device copy; mark it modified and bring the host
  // copy up to date.
  k_alocal.template modify<DeviceType>();
  k_alocal.template sync<LMPHostType>();
 }
 /* ----------------------------------------------------------------------
   Begin routines that are unique to the GPU codepath. These take advantage
   of AoSoA data layouts and scratch memory for recursive polynomials
 ------------------------------------------------------------------------- */
 /*
 Simple team policy functor seeing how many layers deep we can go with the parallelism.
 */
 // ComputeNeigh: for one grid point of the current chunk, write the grid
 // index/coordinate columns of d_alocal, zero its descriptor columns, and
 // collect every atom within the cutoff into the snaKK neighbor arrays.
 // d_ninside(ii) receives the number of neighbors stored.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeNeigh>::member_type& team) const {

  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
  // Main difference is that we don't use the neighbor class or neighbor variables here.
  // This is because the grid points are not atoms and therefore do not get assigned
  // neighbors in LAMMPS.
  // TODO: If we did make a neighborlist for each grid point, we could use current
  //       routines and avoid having to loop over all atoms (which limits us to
  //       natoms = max team size).

  // basic quantities associated with this team:
  // team_rank : rank of thread in this team
  // league_rank : rank of team in this league
  // team_size : number of threads in this team

  // extract loop index (chunk-local grid point)
  int ii = team.team_rank() + team.league_rank() * team.team_size();

  if (ii >= chunk_size) return;

  // extract grid index (row of d_alocal)
  int igrid = ii + chunk_offset;

  // convert to grid indices
  int iz = igrid/(xlen*ylen);
  int i2 = igrid - (iz*xlen*ylen);
  int iy = i2/xlen;
  int ix = i2 % xlen;
  iz += nzlo;
  iy += nylo;
  ix += nxlo;

  // multiply grid integers by grid spacing delx, dely, delz
  double xgrid[3];
  xgrid[0] = ix * delx;
  xgrid[1] = iy * dely;
  xgrid[2] = iz * delz;

  if (triclinic) {
    // Same conversion as lamda2x() in the CPU version, written out by hand
    // because that host function cannot be called from this functor. h0..h5
    // and lo0..lo2 are member copies filled in compute_local(). The update
    // order matters: each line only reads components not yet overwritten.
    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
    xgrid[2] = h2*xgrid[2] + lo2;
  }

  const F_FLOAT xtmp = xgrid[0];
  const F_FLOAT ytmp = xgrid[1];
  const F_FLOAT ztmp = xgrid[2];

  // Zeroing out the components, which are filled as a sum.
  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
    d_alocal(igrid, icol) = 0.0;
  }

  // Fill grid info columns
  d_alocal(igrid, 0) = ix;
  d_alocal(igrid, 1) = iy;
  d_alocal(igrid, 2) = iz;
  d_alocal(igrid, 3) = xtmp;
  d_alocal(igrid, 4) = ytmp;
  d_alocal(igrid, 5) = ztmp;

  // currently, all grid points are type 1
  // not clear what a better choice would be
  const int itype = 1;
  int ielem = 0;
  if (chemflag) ielem = d_map[itype];

  // Single pass over all atoms (looping over ntotal for now): an atom is a
  // neighbor when it lies inside the cutoff and does not share its location
  // with the grid point. Counting and storing use the same test, so
  // d_ninside(ii) always equals the number of slots written.
  // NOTE(review): nothing here limits ninside to the max_neighs value used to
  // size the snaKK arrays in compute_local(); confirm that bound holds.
  // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
  int ninside = 0;
  for (int j = 0; j < ntotal; j++){
    const F_FLOAT dx = x(j,0) - xtmp;
    const F_FLOAT dy = x(j,1) - ytmp;
    const F_FLOAT dz = x(j,2) - ztmp;
    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
    const int jtype = type(j);

    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
      // element index is 0 unless chemflag is set
      const int jelem = chemflag ? d_map[jtype] : 0;

      snaKK.rij(ii,ninside,0) = static_cast<real_type>(dx);
      snaKK.rij(ii,ninside,1) = static_cast<real_type>(dy);
      snaKK.rij(ii,ninside,2) = static_cast<real_type>(dz);
      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
      // actually since the views here have values starting at 0, let's use jelem
      snaKK.wj(ii,ninside) = static_cast<real_type>(d_wjelem[jelem]);
      snaKK.rcutij(ii,ninside) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
      snaKK.inside(ii,ninside) = j;
      if (switchinnerflag) {
        snaKK.sinnerij(ii,ninside) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
        snaKK.dinnerij(ii,ninside) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
      }
      snaKK.element(ii,ninside) = jelem;
      ninside++;
    }
  }

  d_ninside(ii) = ninside;
 }
 /* ----------------------------------------------------------------------
  Pre-compute the Cayley-Klein parameters for reuse in later routines
 ------------------------------------------------------------------------- */
 // One (grid point, neighbor) pair per call; indices past the current chunk or
 // past the stored neighbor count are skipped.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
  // rebuild the chunk-local grid point index from its (mod, div) split
  const int ipoint = iatom_div * vector_length + iatom_mod;

  if (ipoint < chunk_size && jnbor < d_ninside(ipoint))
    snaKK.compute_cayley_klein(ipoint, jnbor);
 }
 /* ----------------------------------------------------------------------
  Initialize the "ulisttot" structure with non-zero on-diagonal terms
  and zero terms elsewhere
 ------------------------------------------------------------------------- */
 // PreUi, (iatom_mod, j, iatom_div) form: initializes ulisttot for one grid
 // point and one j.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
  const int iatom = iatom_mod + iatom_div * vector_length;
  if (iatom >= chunk_size) return;

  // `iatom` is a chunk-local grid point index, not an atom index, so it must
  // not be used to look up the atom `type` view. As in ComputeNeigh, every
  // grid point is treated as type 1 and the element map is only consulted
  // when chemflag is set.
  const int itype = 1;
  const int ielem = chemflag ? d_map[itype] : 0;

  snaKK.pre_ui(iatom, j, ielem);
 }
 // PreUi, (iatom, j) form: initializes ulisttot for one grid point and one j.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const {
  if (iatom >= chunk_size) return;

  // `iatom` is a chunk-local grid point index, not an atom index, so it must
  // not be used to look up the atom `type` view. As in ComputeNeigh, every
  // grid point is treated as type 1 and the element map is only consulted
  // when chemflag is set.
  const int itype = 1;
  const int ielem = chemflag ? d_map[itype] : 0;

  snaKK.pre_ui(iatom, j, ielem);
 }
 // PreUi, (iatom) form: initializes ulisttot for one grid point, looping over
 // all j in [0, twojmax].
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom) const {
  if (iatom >= chunk_size) return;

  // `iatom` is a chunk-local grid point index, not an atom index, so it must
  // not be used to look up the atom `type` view. As in ComputeNeigh, every
  // grid point is treated as type 1 and the element map is only consulted
  // when chemflag is set.
  const int itype = 1;
  const int ielem = chemflag ? d_map[itype] : 0;

  for (int j = 0; j <= twojmax; j++)
    snaKK.pre_ui(iatom, j, ielem);
 }
 // ComputeUi with parallelism over the "bend" location: each thread handles one
 // (iatom_div, jbend, neighbor) triple and the vector lanes cover iatom_mod.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiSmall>::member_type& team) const {

  // flattened atom_div / neighbor number / bend location
  const int flat = team.team_rank() + team.league_rank() * team_size_compute_ui;

  // number of (bend, neighbor) slots per iatom_div
  const auto slots_per_div = max_neighs * (twojmax + 1);

  // iatom_div and jj are deliberately left non-const (GCC 7 bug workaround)
  int iatom_div = flat / slots_per_div;
  const int remainder = flat - iatom_div * slots_per_div;
  const int jbend = remainder / max_neighs;
  int jj = remainder - jbend * max_neighs;

  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
    [&] (const int iatom_mod) {
    const int ii = iatom_mod + vector_length * iatom_div;
    // skip lanes past the chunk and neighbor slots that were never filled
    if (ii < chunk_size && jj < d_ninside(ii))
      snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
  });
 }
 // ComputeUi without parallelism over the "bend" location: each thread handles
 // one (iatom_div, neighbor) pair and the vector lanes cover iatom_mod.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiLarge>::member_type& team) const {

  // flattened atom_div / neighbor number
  const int flat = team.team_rank() + team.league_rank() * team_size_compute_ui;

  // iatom_div is deliberately left non-const (GCC 7 bug workaround)
  int iatom_div = flat / max_neighs;
  int jj = flat - iatom_div * max_neighs;

  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
    [&] (const int iatom_mod) {
    const int ii = iatom_mod + vector_length * iatom_div;
    // skip lanes past the chunk and neighbor slots that were never filled
    if (ii < chunk_size && jj < d_ninside(ii))
      snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
  });
 }
 /* ----------------------------------------------------------------------
  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
  structure. Zero-initialize ylist. CPU and GPU.
 ------------------------------------------------------------------------- */
 // TransformUi, (iatom_mod, idxu, iatom_div) form; out-of-range indices are skipped.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
  const int ipoint = iatom_div * vector_length + iatom_mod;

  if (ipoint < chunk_size && idxu < snaKK.idxu_max)
    snaKK.transform_ui(ipoint, idxu);
 }
 // TransformUi, (iatom, idxu) form; grid points past the current chunk are skipped.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const {
  if (iatom < chunk_size) {
    snaKK.transform_ui(iatom, idxu);
  }
 }
 // TransformUi, (iatom) form: loops over every idxu in [0, snaKK.idxu_max).
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom) const {
  if (iatom >= chunk_size) return;

  int idxu = 0;
  while (idxu < snaKK.idxu_max) {
    snaKK.transform_ui(iatom, idxu);
    idxu++;
  }
 }
 /* ----------------------------------------------------------------------
  Compute all elements of the Z tensor and store them into the `zlist`
   view
 ------------------------------------------------------------------------- */
 // ComputeZi, (iatom_mod, jjz, iatom_div) form; out-of-range indices are skipped.
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
  const int ipoint = iatom_div * vector_length + iatom_mod;

  if (ipoint < chunk_size && jjz < snaKK.idxz_max)
    snaKK.template compute_zi<chemsnap>(ipoint, jjz);
 }
 // ComputeZi, (iatom, jjz) form; grid points past the current chunk are skipped.
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
  if (iatom < chunk_size) {
    snaKK.template compute_zi<chemsnap>(iatom, jjz);
  }
 }
 // ComputeZi, (iatom) form: loops over every jjz in [0, snaKK.idxz_max).
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const {
  if (iatom >= chunk_size) return;

  int jjz = 0;
  while (jjz < snaKK.idxz_max) {
    snaKK.template compute_zi<chemsnap>(iatom, jjz);
    jjz++;
  }
 }
 /* ----------------------------------------------------------------------
  Compute the energy triple products and store in the "blist" view
 ------------------------------------------------------------------------- */
 // ComputeBi, (iatom_mod, jjb, iatom_div) form; out-of-range indices are skipped.
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
  const int ipoint = iatom_div * vector_length + iatom_mod;

  if (ipoint < chunk_size && jjb < snaKK.idxb_max)
    snaKK.template compute_bi<chemsnap>(ipoint, jjb);
 }
 // ComputeBi, (iatom, jjb) form; grid points past the current chunk are skipped.
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
  if (iatom < chunk_size) {
    snaKK.template compute_bi<chemsnap>(iatom, jjb);
  }
 }
 // ComputeBi, (iatom) form: loops over every jjb in [0, snaKK.idxb_max).
 template<class DeviceType, typename real_type, int vector_length>
 template <bool chemsnap> KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const {
  if (iatom >= chunk_size) return;

  int jjb = 0;
  while (jjb < snaKK.idxb_max) {
    snaKK.template compute_bi<chemsnap>(iatom, jjb);
    jjb++;
  }
 }
 // Fill kernel: copies the bispectrum components of chunk-local grid point `ii`
 // from snaKK.blist into the descriptor columns of its d_alocal row.
 // The grid index/coordinate columns are written by ComputeNeigh, so no grid
 // coordinates need to be recomputed here.
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocal2Fill, const int& ii) const {

  // row of d_alocal that belongs to this grid point
  const int igrid = ii + chunk_offset;

  const auto idxb_max = snaKK.idxb_max;

  // linear contributions; the first 6 columns hold the grid info
  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
    const auto idxb = icoeff % idxb_max;
    const auto idx_chem = icoeff / idxb_max;
    d_alocal(igrid,icoeff+6) = snaKK.blist(ii,idx_chem,idxb);
  }
 }
 /* ----------------------------------------------------------------------
   utility functions
 ------------------------------------------------------------------------- */
 // Shrinks `team_size` (in place) so that team_size*vector_length does not
 // exceed the maximum team size Kokkos reports for a parallel_for with TagStyle.
 template<class DeviceType, typename real_type, int vector_length>
 template<class TagStyle>
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
  const int limit = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());

  const bool exceeds_limit = team_size*vector_length > limit;
  if (exceeds_limit) {
    team_size = limit/vector_length;
  }
 }
 // Shrinks `team_size` (in place) so that team_size*vector_length does not
 // exceed the maximum team size Kokkos reports for a parallel_reduce with TagStyle.
 template<class DeviceType, typename real_type, int vector_length>
 template<class TagStyle>
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
  const int limit = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());

  const bool exceeds_limit = team_size*vector_length > limit;
  if (exceeds_limit) {
    team_size = limit/vector_length;
  }
 }
 // Returns the shmem_size() of an unmanaged 1d scratch view holding
 // `values_per_team` entries of scratch_type.
 template<class DeviceType, typename real_type, int vector_length>
 template<typename scratch_type>
 int ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
  using ScratchViewType = Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;

  const auto bytes = ScratchViewType::shmem_size(values_per_team);
  return bytes;
 }
 /* ---------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   routines used by template reference classes
 ------------------------------------------------------------------------- */
 // Device wrapper constructor: only forwards its arguments to the templated base.
 template<class DeviceType>
 ComputeSNAGridLocalKokkosDevice<DeviceType>::ComputeSNAGridLocalKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg)
 {
 }
 // Device wrapper: runs the templated base implementation.
 template<class DeviceType>
 void ComputeSNAGridLocalKokkosDevice<DeviceType>::compute_local()
 {
  this->Base::compute_local();
 }
 #ifdef LMP_KOKKOS_GPU
 // Host wrapper constructor: only forwards its arguments to the templated base.
 template<class DeviceType>
 ComputeSNAGridLocalKokkosHost<DeviceType>::ComputeSNAGridLocalKokkosHost(class LAMMPS *lmp, int narg, char **arg)
   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg)
 {
 }
 // Host wrapper: runs the templated base implementation.
 template<class DeviceType>
 void ComputeSNAGridLocalKokkosHost<DeviceType>::compute_local()
 {
  this->Base::compute_local();
 }
 #endif
 }
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@ -792,6 +792,14 @@ typedef tdual_float_3d::t_dev_um t_float_3d_um;
 typedef tdual_float_3d::t_dev_const_um t_float_3d_const_um;
 typedef tdual_float_3d::t_dev_const_randomread t_float_3d_randomread;
 // 4d float array n: dual view plus aliases for its device-side view types
 using tdual_float_4d = Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType>;
 using t_float_4d = tdual_float_4d::t_dev;
 using t_float_4d_const = tdual_float_4d::t_dev_const;
 using t_float_4d_um = tdual_float_4d::t_dev_um;
 using t_float_4d_const_um = tdual_float_4d::t_dev_const_um;
 using t_float_4d_randomread = tdual_float_4d::t_dev_const_randomread;
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else
@ -1126,6 +1134,14 @@ typedef tdual_float_3d::t_host_um t_float_3d_um;
 typedef tdual_float_3d::t_host_const_um t_float_3d_const_um;
 typedef tdual_float_3d::t_host_const_randomread t_float_3d_randomread;
 // 4d float array n: dual view plus aliases for its host-side view types
 using tdual_float_4d = Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType>;
 using t_float_4d = tdual_float_4d::t_host;
 using t_float_4d_const = tdual_float_4d::t_host_const;
 using t_float_4d_um = tdual_float_4d::t_host_um;
 using t_float_4d_const_um = tdual_float_4d::t_host_const_um;
 using t_float_4d_randomread = tdual_float_4d::t_host_const_randomread;
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else
--- a/src/KOKKOS/memory_kokkos.h
+++ b/src/KOKKOS/memory_kokkos.h
@ -101,6 +101,7 @@ template <typename TYPE, typename HTYPE>
 {
  data = TYPE(std::string(name),n1,n2);
  h_data = Kokkos::create_mirror_view(data);
  //printf(">>> name: %s\n", name);
  return data;
 }
@ -111,6 +112,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
  data = TYPE(std::string(name),n1,n2);
  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
  array = (typename TYPE::value_type **) smalloc(nbytes,name);
  //printf(">>> name %s nbytes %d\n", name, nbytes);
  for (int i = 0; i < n1; i++) {
    if (n2 == 0)
@ -121,6 +123,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
  return data;
 }
 /* ----------------------------------------------------------------------
   create a 4d Kokkos array plus a host-side pointer table into it
   1st index has extent n1
   2nd index range is n2lo to n2hi inclusive (extent n2hi-n2lo+1)
   3rd index range is n3lo to n3hi inclusive (extent n3hi-n3lo+1)
   4th index range is n4lo to n4hi inclusive (extent n4hi-n4lo+1)
   data  = (re)assigned to a freshly constructed TYPE labeled with name
   array = nested pointer table; array[i][j][k] points at data.h_view(i,j,k,0)
   returns a copy of data
   NOTE: the lo bounds are only used to compute the extents; the pointers
         stored in array are not shifted, so array is indexed from 0 in
         every dimension
   NOTE: no validation of the ranges is done; a dimension with extent 0
         yields nullptr entries at that level
   cannot grow it
 ------------------------------------------------------------------------- */
 template <typename TYPE>
 TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array,
                             int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi,
                             const char *name)
 {
  // extents of the three range-defined dimensions
  const int n2 = n2hi - n2lo + 1;
  const int n3 = n3hi - n3lo + 1;
  const int n4 = n4hi - n4lo + 1;
  data = TYPE(std::string(name),n1,n2,n3,n4);
  // level 1: one pointer per first index
  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1;
  array = (typename TYPE::value_type ****) smalloc(nbytes,name);
  for (int i = 0; i < n1; i++) {
    if (n2 == 0) {
      array[i] = nullptr;
    } else {
      // level 2: one pointer per second index
      nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2;
      array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name);
      for (int j = 0; j < n2; j++){
        if (n3 == 0){
          array[i][j] = nullptr;
        } else {
          // level 3: one pointer per third index, aimed at the host view storage
          nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3;
          array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name);
          for (int k = 0; k < n3; k++){
            if (n4 == 0)
              array[i][j][k] = nullptr;
            else
              array[i][j][k] = &data.h_view(i,j,k,0);
          }
        }
      }
    }
  }
  return data;
 }
 template <typename TYPE, typename HTYPE>
  TYPE create_kokkos(TYPE &data, HTYPE &h_data,
                     typename TYPE::value_type **&array, int n1, int n2,
--- a/src/KOKKOS/pair_mliap_kokkos.cpp
+++ b/src/KOKKOS/pair_mliap_kokkos.cpp
@ -240,6 +240,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
      if (strcmp(elemname,descriptor->elements[jelem]) == 0)
        break;
    //printf(">>> nelements: %d\n", descriptor->nelements);
    if (jelem < descriptor->nelements)
      map[i] = jelem;
    else if (strcmp(elemname,"NULL") == 0) map[i] = -1;
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@ -375,7 +375,6 @@ class PairSNAPKokkos : public PairSNAP {
  // Make SNAKokkos a friend
  friend class SNAKokkos<DeviceType, real_type, vector_length>;
 };
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -3,12 +3,10 @@
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
@ -39,17 +37,6 @@
 namespace LAMMPS_NS {
 // Outstanding issues with quadratic term
 // 1. there seems to a problem with compute_optimized energy calc
 // it does not match compute_regular, even when quadratic coeffs = 0
 //static double t1 = 0.0;
 //static double t2 = 0.0;
 //static double t3 = 0.0;
 //static double t4 = 0.0;
 //static double t5 = 0.0;
 //static double t6 = 0.0;
 //static double t7 = 0.0;
 /* ---------------------------------------------------------------------- */
 template<class DeviceType, typename real_type, int vector_length>
@ -219,7 +206,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
      // team_size_compute_neigh is defined in `pair_snap_kokkos.h`
      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs);
-      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh>
        policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
    }
@ -259,7 +247,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall>
          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
        Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this);
      } else {
@ -269,7 +258,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
        const int n_teams = chunk_size_div * max_neighs;
        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge>
          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
        Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this);
      }
@ -536,8 +526,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::coeff(int narg, char
  Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
  Kokkos::deep_copy(d_map,h_map);
-  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this); //rfac0,twojmax,
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
    //rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
  snaKK.grow_rij(0,0);
  snaKK.init();
 }
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@ -29,7 +29,9 @@
 #endif
 namespace LAMMPS_NS {
-
+// copied from pair_snap_kokkos.h
 // pre-declare so sna_kokkos.h can refer to it
 template<class DeviceType, typename real_type_, int vector_length_> class PairSNAPKokkos;
 template<typename real_type_, int vector_length_>
 struct WignerWrapper {
  using real_type = real_type_;
@ -170,9 +172,9 @@ class SNAKokkos {
  KOKKOS_INLINE_FUNCTION
  SNAKokkos(const SNAKokkos<DeviceType,real_type,vector_length>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
  template<class CopyClass>
  inline
-  //SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int);
+  SNAKokkos(const CopyClass&);
  SNAKokkos(const PairSNAPKokkos<DeviceType, real_type, vector_length>&);
  KOKKOS_INLINE_FUNCTION
  ~SNAKokkos();
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@ -29,17 +29,18 @@ static const double MY_PI  = 3.14159265358979323846; // pi
 static const double MY_PI2  = 1.57079632679489661923; // pi/2
 template<class DeviceType, typename real_type, int vector_length>
 template<class CopyClass>
 inline
-SNAKokkos<DeviceType, real_type, vector_length>::SNAKokkos(const PairSNAPKokkos<DeviceType, real_type, vector_length>& psk)
+SNAKokkos<DeviceType, real_type, vector_length>::SNAKokkos(const CopyClass& copy)
-  : rfac0(psk.rfac0), rmin0(psk.rmin0), switch_flag(psk.switchflag),
+  : twojmax(copy.twojmax), d_coeffelem(copy.d_coeffelem), rmin0(copy.rmin0),
-    bzero_flag(psk.bzeroflag), chem_flag(psk.chemflag), bnorm_flag(psk.bnormflag),
+    rfac0(copy.rfac0), switch_flag(copy.switchflag), switch_inner_flag(copy.switchinnerflag),
-    wselfall_flag(psk.wselfallflag), switch_inner_flag(psk.switchinnerflag),
+    chem_flag(copy.chemflag), bnorm_flag(copy.bnormflag), wselfall_flag(copy.wselfallflag),
-    quadratic_flag(psk.quadraticflag), twojmax(psk.twojmax), d_coeffelem(psk.d_coeffelem)
+    quadratic_flag(copy.quadraticflag), bzero_flag(copy.bzeroflag)
 {
  wself = static_cast<real_type>(1.0);
  if (chem_flag)
-    nelements = psk.nelements;
+    nelements = copy.nelements;
  else
    nelements = 1;
@ -611,7 +612,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
    }
    ulist_wrapper.set(ma, ulist_accum);
    mb++;
  }
@ -830,7 +830,6 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
  int jju1 = idxu_block[j1] + (j1+1)*mb1min;
  int jju2 = idxu_block[j2] + (j2+1)*mb2max;
  int icgb = mb1min*(j2+1) + mb2max;
  #ifdef LMP_KK_DEVICE_COMPILE
  #pragma unroll
  #endif
--- a/src/ML-SNAP/compute_gaussian_grid_local.cpp
+++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp
@ -0,0 +1,166 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/ Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #include "compute_gaussian_grid_local.h"
 #include "atom.h"
 #include "comm.h"
 #include "error.h"
 #include "force.h"
 #include "math_const.h"
 #include "math_special.h"
 #include "memory.h"
 #include "modify.h"
 #include "update.h"
 #include <cmath>
 #include <cstring>
 using namespace LAMMPS_NS;
 using MathConst::MY_2PI;
 using MathSpecial::powint;
 /* ----------------------------------------------------------------------
   constructor
   after the base-class arguments are skipped, reads:
     arg[3]                       = rcutfac
     arg[4 .. 4+ntypes-1]         = radelem for types 1..ntypes
     arg[4+ntypes .. 4+2*ntypes-1] = sigmaelem for types 1..ntypes
   then builds the squared-cutoff table and pre-computes the per-type
   Gaussian prefactor and exponent factor
   errors out if fewer than 4 + 2*ntypes arguments remain
 ------------------------------------------------------------------------- */
 ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char **arg) :
    ComputeGridLocal(lmp, narg, arg), ncoeff(0), cutsq(nullptr), rcutfac(0.0), radelem(nullptr),
    sigmaelem(nullptr), prefacelem(nullptr), argfacelem(nullptr), map(nullptr), nelements(0),
    cutmax(0.0)
 {
  // skip over arguments used by base class
  // so that argument positions are identical to
  // regular per-atom compute
  arg += nargbase;
  narg -= nargbase;
  const int ntypes = atom->ntypes;
  const int nargmin = 4 + 2 * ntypes;
  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);
  // process required arguments
  // per-type arrays are offset by 1 to match up with types; entry 0 is not set here
  memory->create(radelem, ntypes + 1, "gaussian/atom:radelem");
  memory->create(sigmaelem, ntypes + 1, "gaussian/atom:sigmaelem");
  memory->create(prefacelem, ntypes + 1, "gaussian/atom:prefacelem");
  memory->create(argfacelem, ntypes + 1, "gaussian/atom:argfacelem");
  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp);
  for (int i = 0; i < ntypes; i++)
    sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp);
  // construct cutsq: symmetric table of squared cutoffs, cutmax = largest same-type cutoff
  cutmax = 0.0;
  memory->create(cutsq, ntypes + 1, ntypes + 1, "gaussian/atom:cutsq");
  for (int i = 1; i <= ntypes; i++) {
    const double cutself = 2.0 * radelem[i] * rcutfac;
    if (cutself > cutmax) cutmax = cutself;
    cutsq[i][i] = cutself * cutself;
    for (int j = i + 1; j <= ntypes; j++) {
      const double cutpair = (radelem[i] + radelem[j]) * rcutfac;
      cutsq[i][j] = cutsq[j][i] = cutpair * cutpair;
    }
  }
  // one extra local column per atom type, appended after the base-class columns
  size_local_cols = size_local_cols_base + ntypes;
  // pre-compute coefficients:
  //   prefacelem = 1 / (sigma * sqrt(2 pi))^3,  argfacelem = 1 / (2 sigma^2)
  for (int itype = 1; itype <= ntypes; itype++) {
    prefacelem[itype] = 1.0/powint(sigmaelem[itype] * sqrt(MY_2PI), 3);
    argfacelem[itype] = 1.0/(2.0 * sigmaelem[itype] * sigmaelem[itype]);
  }
 }
 /* ---------------------------------------------------------------------- */
 ComputeGaussianGridLocal::~ComputeGaussianGridLocal()
 {
  // instances flagged with copymode skip all cleanup
  if (!copymode) {
    // squared-cutoff table
    memory->destroy(cutsq);
    // per-type arrays created in the constructor
    memory->destroy(argfacelem);
    memory->destroy(prefacelem);
    memory->destroy(sigmaelem);
    memory->destroy(radelem);
  }
 }
 /* ---------------------------------------------------------------------- */
 void ComputeGaussianGridLocal::init()
 {
  // warn, on rank 0 only, when more than one compute of this style is defined
  const auto ninstances = modify->get_compute_by_style("^gaussian/grid/local$").size();
  if (ninstances <= 1) return;
  if (comm->me == 0)
    error->warning(FLERR, "More than one instance of compute gaussian/grid/local");
 }
 /* ---------------------------------------------------------------------- */
 void ComputeGaussianGridLocal::compute_local()
 {
  invoked_local = update->ntimestep;
  // for every grid point in [nxlo,nxhi] x [nylo,nyhi] x [nzlo,nzhi], sum the
  // Gaussian contributions of all local + ghost atoms, one column per atom type
  double **const pos = atom->x;
  const int *const amask = atom->mask;
  int *const atype = atom->type;
  const int natoms = atom->nlocal + atom->nghost;
  int irow = 0;
  for (int iz = nzlo; iz <= nzhi; iz++) {
    for (int iy = nylo; iy <= nyhi; iy++) {
      for (int ix = nxlo; ix <= nxhi; ix++, irow++) {
        double gridpos[3];
        grid2x(ix, iy, iz, gridpos);
        auto *const row = alocal[irow];
        // the per-type columns are accumulated as sums, so clear them first
        for (int c = size_local_cols_base; c < size_local_cols; c++) row[c] = 0.0;
        for (int j = 0; j < natoms; j++) {
          // skip atoms outside the compute group
          if ((amask[j] & groupbit) == 0) continue;
          const double dx = gridpos[0] - pos[j][0];
          const double dy = gridpos[1] - pos[j][1];
          const double dz = gridpos[2] - pos[j][2];
          const double rsq = dx * dx + dy * dy + dz * dz;
          const int jt = atype[j];
          // only atoms strictly inside their same-type cutoff contribute
          if (!(rsq < cutsq[jt][jt])) continue;
          // types are 1-based, columns for types start at size_local_cols_base
          row[size_local_cols_base + jt - 1] += prefacelem[jt] * exp(-rsq * argfacelem[jt]);
        }
      }
    }
  }
 }
 /* ----------------------------------------------------------------------
   memory usage
   returns the number of bytes held in the per-type arrays and the
   squared-cutoff table allocated by this class (values only)
 ------------------------------------------------------------------------- */
 double ComputeGaussianGridLocal::memory_usage()
 {
  // all sizes are ntypes + 1 because the arrays are indexed by 1-based type
  const double n = atom->ntypes + 1;
  double nbytes = 4.0 * n * sizeof(double);    // radelem, sigmaelem, prefacelem, argfacelem
  nbytes += n * n * sizeof(double);            // cutsq
  return nbytes;
 }
--- a/src/ML-SNAP/compute_gaussian_grid_local.h
+++ b/src/ML-SNAP/compute_gaussian_grid_local.h
@ -0,0 +1,51 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/ Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifdef COMPUTE_CLASS
 // clang-format off
 ComputeStyle(gaussian/grid/local,ComputeGaussianGridLocal);
 // clang-format on
 #else
 #ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
 #define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
 #include "compute_grid_local.h"
 namespace LAMMPS_NS {
 // Compute style gaussian/grid/local: for each grid point handled by the
 // ComputeGridLocal base class, accumulates one value per atom type that is
 // the sum of Gaussian contributions from atoms of that type within a cutoff.
 class ComputeGaussianGridLocal : public ComputeGridLocal {
 public:
  ComputeGaussianGridLocal(class LAMMPS *, int, char **);
  ~ComputeGaussianGridLocal() override;
  void init() override;           // warns if more than one instance of this style exists
  void compute_local() override;  // fills the per-type columns of the local grid array
  double memory_usage() override;
 protected:
  int ncoeff;         // NOTE(review): not read by the base implementation shown; presumably for derived classes - confirm
  double **cutsq;     // squared cutoff for each pair of atom types, indexed [itype][jtype] from 1
  double rcutfac;     // global cut-off scale
  double *radelem;    // cut-off radius of each atom type
  double *sigmaelem;  // Gaussian width of each atom type
  double *prefacelem; // Gaussian prefactor of each atom type: 1/(sigma*sqrt(2*pi))^3
  double *argfacelem; // Gaussian argument factor of each atom type: 1/(2*sigma^2)
  int *map;    // map types to [0,nelements)
  int nelements;      // NOTE(review): not read by the base implementation shown - confirm intended use
  double cutmax;      // largest same-type cutoff, 2*radelem*rcutfac, over all types
 };
 }    // namespace LAMMPS_NS
 #endif
 #endif
--- a/src/ML-SNAP/compute_grid.cpp
+++ b/src/ML-SNAP/compute_grid.cpp
@ -57,6 +57,7 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :
 ComputeGrid::~ComputeGrid()
 {
  // instances flagged with copymode skip the grid cleanup
  if (!copymode) deallocate();
 }
@ -111,7 +112,6 @@ void ComputeGrid::assign_coords_all()
 void ComputeGrid::allocate()
 {
  // allocate arrays
  memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
  memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
  if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
--- a/src/ML-SNAP/compute_grid_local.cpp
+++ b/src/ML-SNAP/compute_grid_local.cpp
@ -119,6 +119,8 @@ void ComputeGridLocal::allocate()
 void ComputeGridLocal::deallocate()
 {
  if (copymode) return;
  if (gridlocal_allocated) {
    gridlocal_allocated = 0;
    memory->destroy(alocal);
--- a/src/ML-SNAP/compute_sna_grid.cpp
+++ b/src/ML-SNAP/compute_sna_grid.cpp
@ -31,14 +31,13 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  // skip over arguments used by base class
  // so that argument positions are identical to
  // regular per-atom compute
  arg += nargbase;
  narg -= nargbase;
  // begin code common to all SNAP computes
-  double rfac0, rmin0;
+  //double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
  int ntypes = atom->ntypes;
  int nargmin = 6 + 2 * ntypes;
@ -56,6 +55,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  wselfallflag = 0;
  switchinnerflag = 0;
  nelements = 1;
  chunksize = 32768;
  parallel_thresh = 8192;
  // process required arguments
@ -67,8 +68,9 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);
  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
-  for (int i = 0; i < ntypes; i++)
+  for (int i = 0; i < ntypes; i++) {
    wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
  }
  // construct cutsq
@ -181,11 +183,12 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
 ComputeSNAGrid::~ComputeSNAGrid()
 {
  // instances flagged with copymode skip all cleanup
  if (!copymode) {
    // per-type arrays and squared-cutoff table
    memory->destroy(radelem);
    memory->destroy(wjelem);
    memory->destroy(cutsq);
    // SNA helper object
    delete snaptr;
    // the type map is only released when chemflag is set
    if (chemflag) memory->destroy(map);
  }
 }
@ -202,6 +205,7 @@ void ComputeSNAGrid::init()
 void ComputeSNAGrid::compute_array()
 {
  invoked_array = update->ntimestep;
  // compute sna for each gridpoint
--- a/src/ML-SNAP/compute_sna_grid.h
+++ b/src/ML-SNAP/compute_sna_grid.h
@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid {
  void init() override;
  void compute_array() override;
  double memory_usage() override;
  int ncoeff,nelements; // public for kokkos, but could go in the protected block now
- private:
+ protected:
-  int ncoeff;
+  //int ncoeff;
  double **cutsq;
  double rcutfac;
  double *radelem;
  double *wjelem;
  int *map;    // map types to [0,nelements)
-  int nelements, chemflag;
+  int chemflag;
  int switchinnerflag;
  double *sinnerelem;
  double *dinnerelem;
  int parallel_thresh;
  class SNA *snaptr;
  double cutmax;
  int quadraticflag;
  double rfac0, rmin0;
  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
  int chunksize;
 };
 }    // namespace LAMMPS_NS
--- a/src/ML-SNAP/compute_sna_grid_local.cpp
+++ b/src/ML-SNAP/compute_sna_grid_local.cpp
@ -37,8 +37,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
  // begin code common to all SNAP computes
-  double rfac0, rmin0;
+  //double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
  int ntypes = atom->ntypes;
  int nargmin = 6 + 2 * ntypes;
@ -56,6 +56,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
  wselfallflag = 0;
  switchinnerflag = 0;
  nelements = 1;
  chunksize = 32768;
  parallel_thresh = 8192;
  // process required arguments
@ -180,6 +182,7 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
 ComputeSNAGridLocal::~ComputeSNAGridLocal()
 {
  if (copymode) return;
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(cutsq);
--- a/src/ML-SNAP/compute_sna_grid_local.h
+++ b/src/ML-SNAP/compute_sna_grid_local.h
@ -32,7 +32,7 @@ class ComputeSNAGridLocal : public ComputeGridLocal {
  void compute_local() override;
  double memory_usage() override;
- private:
+ protected:
  int ncoeff;
  double **cutsq;
  double rcutfac;
@ -46,6 +46,10 @@ class ComputeSNAGridLocal : public ComputeGridLocal {
  class SNA *snaptr;
  double cutmax;
  int quadraticflag;
  double rfac0, rmin0;
  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
  int chunksize;
  int parallel_thresh;
 };
 }    // namespace LAMMPS_NS