Merge remote-tracking branch 'github/develop' into collected-small-changes

2025-01-23 12:06:59 -05:00
parent f8484de195 0dfd5d821a
commit b5da9ce3e2
36 changed files with 3434 additions and 56 deletions
--- a/doc/src/Commands_compute.rst
+++ b/doc/src/Commands_compute.rst
@ -58,6 +58,7 @@ KOKKOS, o = OPENMP, t = OPT.
   * :doc:`fep/ta <compute_fep_ta>`
   * :doc:`force/tally <compute_tally>`
   * :doc:`fragment/atom <compute_cluster_atom>`
+   * :doc:`gaussian/grid/local (k) <compute_gaussian_grid_local>`
   * :doc:`global/atom <compute_global_atom>`
   * :doc:`group/group <compute_group_group>`
   * :doc:`gyration <compute_gyration>`
@ -140,8 +141,8 @@ KOKKOS, o = OPENMP, t = OPT.
   * :doc:`smd/vol <compute_smd_vol>`
   * :doc:`snap <compute_sna_atom>`
   * :doc:`sna/atom <compute_sna_atom>`
-   * :doc:`sna/grid <compute_sna_atom>`
-   * :doc:`sna/grid/local <compute_sna_atom>`
+   * :doc:`sna/grid (k) <compute_sna_atom>`
+   * :doc:`sna/grid/local (k) <compute_sna_atom>`
   * :doc:`snad/atom <compute_sna_atom>`
   * :doc:`snav/atom <compute_sna_atom>`
   * :doc:`sph/e/atom <compute_sph_e_atom>`
--- a/doc/src/compute.rst
+++ b/doc/src/compute.rst
@ -236,6 +236,7 @@ The individual style names on the :doc:`Commands compute <Commands_compute>` pag
 * :doc:`fep/ta <compute_fep_ta>` - compute free energies for a test area perturbation
 * :doc:`force/tally <compute_tally>` - force between two groups of atoms via the tally callback mechanism
 * :doc:`fragment/atom <compute_cluster_atom>` - fragment ID for each atom
+* :doc:`gaussian/grid/local <compute_gaussian_grid_local>` - local array of Gaussian atomic contributions on a regular grid
 * :doc:`global/atom <compute_global_atom>` - assign global values to each atom from arrays of global values
 * :doc:`group/group <compute_group_group>` - energy/force between two groups of atoms
 * :doc:`gyration <compute_gyration>` - radius of gyration of group of atoms
--- a/doc/src/compute_gaussian_grid_local.rst
+++ b/doc/src/compute_gaussian_grid_local.rst
@ -0,0 +1,97 @@
+.. index:: compute gaussian/grid/local
+.. index:: compute gaussian/grid/local/kk
+
+compute gaussian/grid/local command
+===================================
+
+Accelerator Variants: *gaussian/grid/local/kk*
+
+Syntax
+""""""
+
+.. code-block:: LAMMPS
+
+   compute ID group-ID gaussian/grid/local grid nx ny nz rcutfac  R_1 R_2 ... sigma_1 sigma_2
+
+* ID, group-ID are documented in :doc:`compute <compute>` command
+* gaussian/grid/local = style name of this compute command
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
+* *sigma_1, sigma_2,...* = Gaussian widths, one for each type (distance units)
+
+Examples
+""""""""
+
+.. code-block:: LAMMPS
+
+    compute mygrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4
+
+Description
+"""""""""""
+
+Define a computation that calculates a Gaussian representation of the ionic
+structure. This representation is used for the efficient evaluation
+of quantities related to the structure factor in a grid-based workflow,
+such as the ML-DFT workflow MALA :ref:`(Ellis) <Ellis2021b>`, for which it was originally
+implemented. Usage of the workflow is described in a separate publication :ref:`(Fiedler) <Fiedler2023>`.
+
+For each LAMMPS type, a separate sum of Gaussians is calculated, using
+a separate Gaussian broadening per type. The computation
+is always performed on the numerical grid, no atom-based version of this
+compute exists. The Gaussian representation can only be executed in a local
+fashion, thus the output array only contains rows for grid points
+that are local to the processor subdomain. The layout of the grid is the same
+as for the see :doc:`sna/grid/local <compute_sna_atom>` command.
+
+Namely, the array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest.  Each row of the array contains
+the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the values of the Gaussians
+(one floating point number per type per grid point).
+
+----------
+
+
+.. include:: accel_styles.rst
+
+
+
+----------
+
+Output info
+"""""""""""
+
+Compute *gaussian/grid/local* evaluates a local array.
+The array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest.  The array contains math :math:`ntypes+6` columns,
+where *ntypes* is the number of LAMMPS types. The first three columns are
+the global indexes *ix*, *iy*, and *iz*, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the *ntypes* columns
+containing the values of the Gaussians for each type.
+
+Restrictions
+""""""""""""
+
+These computes are part of the ML-SNAP package.  They are only enabled
+if LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.
+
+Related commands
+""""""""""""""""
+
+:doc:`compute sna/grid/local <compute_sna_atom>`
+
+----------
+
+.. _Ellis2021b:
+
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
+
+.. _Fiedler2023:
+
+**(Fiedler)** Fiedler, Modine, Schmerler, Vogel, Popoola, Thompson, Rajamanickam, and Cangi,
+`npj Comp. Mater., 9, 115 (2023) <https://doi.org/10.1038/s41524-023-01070-z>`_
+
--- a/doc/src/compute_sna_atom.rst
+++ b/doc/src/compute_sna_atom.rst
@ -3,7 +3,9 @@
 .. index:: compute snav/atom
 .. index:: compute snap
 .. index:: compute sna/grid
+.. index:: compute sna/grid/kk
 .. index:: compute sna/grid/local
+.. index:: compute sna/grid/local/kk

 compute sna/atom command
 ========================
@ -20,9 +22,14 @@ compute snap command
 compute sna/grid command
 ========================

+compute sna/grid/kk command
+===========================
+
 compute sna/grid/local command
 ==============================

+Accelerator Variants: *sna/grid/local/kk*
+
 Syntax
 """"""

@ -33,17 +40,17 @@ Syntax
   compute ID group-ID snav/atom rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
   compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
   compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid/local nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid/local grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...

 * ID, group-ID are documented in :doc:`compute <compute>` command
 * sna/atom = style name of this compute command
-* rcutfac = scale factor applied to all cutoff radii (positive real)
-* rfac0 = parameter in distance to angle conversion (0 < rcutfac < 1)
-* twojmax = band limit for bispectrum components (non-negative integer)
-* R_1, R_2,... = list of cutoff radii, one for each type (distance units)
-* w_1, w_2,... = list of neighbor weights, one for each type
-* nx, ny, nz = number of grid points in x, y, and z directions (positive integer)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
+* *rfac0* = parameter in distance to angle conversion (0 < rcutfac < 1)
+* *twojmax* = band limit for bispectrum components (non-negative integer)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
+* *w_1, w_2,...* = list of neighbor weights, one for each type
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
 * zero or more keyword/value pairs may be appended
 * keyword = *rmin0* or *switchflag* or *bzeroflag* or *quadraticflag* or *chem* or *bnormflag* or *wselfallflag* or *bikflag* or *switchinnerflag* or *sinner* or *dinner* or *dgradflag* or *nnn* or *wmode* or *delta*

@ -103,7 +110,7 @@ Examples
   compute snap all snap 1.4 0.95 6 2.0 1.0
   compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 chem 2 0 1
   compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 switchinnerflag 1 sinner 1.35 1.6 dinner 0.25 0.3
-   compute bgrid all sna/grid/local 200 200 200 1.4 0.95 6 2.0 1.0
+   compute bgrid all sna/grid/local grid 200 200 200 1.4 0.95 6 2.0 1.0
   compute bnnn all sna/atom 9.0 0.99363 8 0.5 1.0 rmin0 0.0 nnn 24 wmode 1 delta 0.2

 Description
@ -252,7 +259,8 @@ for finite-temperature Kohn-Sham density functional theory (:ref:`Ellis
 et al. <Ellis2021>`) Neighbor atoms not in the group do not contribute
 to the bispectrum components of the grid points. The distance cutoff
 :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom
-*i'*.
+*i'*. Both computes can be hardware accelerated with Kokkos by using the
+*sna/grid/kk* and *sna/grid/local/kk* commands, respectively.

 Compute *sna/grid* calculates a global array containing bispectrum
 components for a regular grid of points.
@ -463,6 +471,12 @@ fluctuations in the resulting local atomic environment fingerprint.  The
 detailed formalism is given in the paper by Lafourcade et
 al. :ref:`(Lafourcade) <Lafourcade2023_2>`.

+----------
+
+
+.. include:: accel_styles.rst
+
+
 ----------

 Output info
@ -654,7 +668,7 @@ of Angular Momentum, World Scientific, Singapore (1987).

 .. _Ellis2021:

-**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam,  Phys Rev B, 104, 035120, (2021)
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_

 .. _Lafourcade2023_2:

--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@ -3380,6 +3380,7 @@ Schilfgarde
 Schimansky
 Schiotz
 Schlitter
+Schmerler
 Schmid
 Schnieders
 Schoen
@ -4042,6 +4043,7 @@ VMDARCH
 VMDHOME
 vn
 Voigt
+Vogel
 volfactor
 Volkov
 Volpe
--- a/examples/snap/README.md
+++ b/examples/snap/README.md
@ -9,5 +9,11 @@ in.snap.Mo_Chen                   # SNAP linear Mo potential
 in.snap.compute                   # SNAP compute for training a linear model
 in.snap.compute.quadratic         # SNAP compute for training a quadratic model
 in.snap.scale.Ni_Zuo_JCPA2020     # SNAP linear Ni potential with thermodynamic integration (fix adapt scale)
+in.C_SNAP                         # SNAP carbon potential

 compute_snap_dgrad.py             # SNAP compute with dgradflag (dBi/dRj) for training a non-linear model
+
+in.snap.grid                      # SNAP descriptors on a grid
+in.snap.grid.triclinic            # SNAP descriptors on a grid, triclinic
+in.gaussian.grid                  # Gaussian descriptors on a grid
+
--- a/examples/snap/in.gaussian.grid
+++ b/examples/snap/in.gaussian.grid
@ -0,0 +1,68 @@
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+# 
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable 	ny equal ${nrep}
+variable 	nz equal ${nrep}
+
+boundary	p p p
+
+lattice		custom $a &
+		a1 1 0 0 &
+		a2 0 1 0  &
+		a3 0 0 1 &
+		basis 0 0 0 &
+		basis 0.5 0.5 0.5 &
+
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+create_box	2 box
+create_atoms	1 box basis 1 1 basis 2 2
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string &
+		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+		
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} &
+	 	${gaussian_options}
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
--- a/examples/snap/in.snap.grid
+++ b/examples/snap/in.snap.grid
--- a/examples/snap/in.snap.grid.triclinic
+++ b/examples/snap/in.snap.grid.triclinic
@ -47,7 +47,6 @@ lattice		custom $a &
 		basis 0.0 0.0 0.5 &
 		spacing 1 1 1

-box 		tilt large
 region		box prism 0 ${nx} 0 ${ny} 0 ${nz} ${ny} ${nz} ${nz}
 create_box	1 box
 create_atoms	1 box
--- a/examples/snap/log.10Dec24.gaussian.grid.g++.1
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.1
@ -0,0 +1,129 @@
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
+  using 1 OpenMP thread(s) per MPI task
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+#
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable       	nx equal 1
+variable 	ny equal ${nrep}
+variable 	ny equal 1
+variable 	nz equal ${nrep}
+variable 	nz equal 1
+
+boundary	p p p
+
+lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+Lattice spacing in x,y,z = 3.316 3.316 3.316
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+region		box block 0 1 0 ${ny} 0 ${nz}
+region		box block 0 1 0 1 0 ${nz}
+region		box block 0 1 0 1 0 1
+create_box	2 box
+Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  1 by 1 by 1 MPI processor grid
+create_atoms	1 box basis 1 1 basis 2 2
+Created 2 atoms
+  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  create_atoms CPU = 0.001 seconds
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 0.2
+
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_style      zero 4.67637
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
+WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 6.67637
+  ghost atom cutoff = 6.67637
+  binsize = 3.338185, bins = 1 1 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes
+   Step         c_val1         c_val2    
+         0   25.521859      7.9367045    
+Loop time of 1.088e-06 on 1 procs for 0 steps with 2 atoms
+
+183.8% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 1.088e-06  |            |       |100.00
+
+Nlocal:              2 ave           2 max           2 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:            339 ave         339 max         339 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:             64 ave          64 max          64 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 64
+Ave neighs/atom = 32
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:00
--- a/examples/snap/log.10Dec24.gaussian.grid.g++.4
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4
@ -0,0 +1,130 @@
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
+  using 1 OpenMP thread(s) per MPI task
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+#
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable       	nx equal 1
+variable 	ny equal ${nrep}
+variable 	ny equal 1
+variable 	nz equal ${nrep}
+variable 	nz equal 1
+
+boundary	p p p
+
+lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+Lattice spacing in x,y,z = 3.316 3.316 3.316
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+region		box block 0 1 0 ${ny} 0 ${nz}
+region		box block 0 1 0 1 0 ${nz}
+region		box block 0 1 0 1 0 1
+create_box	2 box
+Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  1 by 2 by 2 MPI processor grid
+create_atoms	1 box basis 1 1 basis 2 2
+Created 2 atoms
+  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  create_atoms CPU = 0.001 seconds
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 0.2
+
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_style      zero 4.67637
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
+WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 6.67637
+  ghost atom cutoff = 6.67637
+  binsize = 3.338185, bins = 1 1 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202)
+Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes
+   Step         c_val1         c_val2    
+         0   25.521859      7.9367045    
+Loop time of 2.238e-06 on 4 procs for 0 steps with 2 atoms
+
+89.4% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 2.238e-06  |            |       |100.00
+
+Nlocal:            0.5 ave           1 max           0 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost:          274.5 ave         275 max         274 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Neighs:             16 ave          40 max           0 min
+Histogram: 2 0 0 0 0 0 1 0 0 1
+
+Total # of neighbors = 64
+Ave neighs/atom = 32
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:00
--- a/src/.gitignore
+++ b/src/.gitignore
@ -252,6 +252,8 @@
 /*rheo*.cpp
 /*rheo*.h

+/compute_gaussian_grid_local.cpp
+/compute_gaussian_grid_local.h
 /compute_grid.cpp
 /compute_grid.h
 /compute_grid_local.cpp
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@ -119,6 +119,14 @@ action compute_composition_atom_kokkos.cpp compute_composition_atom.cpp
 action compute_composition_atom_kokkos.h compute_composition_atom.h
 action compute_orientorder_atom_kokkos.cpp
 action compute_orientorder_atom_kokkos.h
+action compute_sna_grid_kokkos.cpp compute_sna_grid.cpp
+action compute_sna_grid_kokkos.h compute_sna_grid.h
+action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp
+action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp
+action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h
+action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp
+action compute_gaussian_grid_local_kokkos.cpp compute_gaussian_grid_local.cpp
+action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local.h
 action compute_temp_deform_kokkos.cpp
 action compute_temp_deform_kokkos.h
 action compute_temp_kokkos.cpp
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@ -0,0 +1,327 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Drew Rohskopf (SNL)
+------------------------------------------------------------------------- */
+
+#include "compute_gaussian_grid_local_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "pair.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeGaussianGridLocal(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j]; //cutsq_tmp;
+      k_cutsq.template modify<LMPHostType>();
+    }
+  }
+  // Set up element lists
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",n);
+  MemKK::realloc_kokkos(d_sigmaelem,"ComputeSNAGridKokkos::sigmaelem",n+1);
+  MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1);
+  MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1);
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem);
+  auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem);
+  auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_sigmaelem(i-1) = sigmaelem[i];
+    h_prefacelem(i-1) = prefacelem[i];
+    h_argfacelem(i-1) = argfacelem[i];
+  }
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_sigmaelem,h_sigmaelem);
+  Kokkos::deep_copy(d_prefacelem, h_prefacelem);
+  Kokkos::deep_copy(d_argfacelem, h_argfacelem);
+  Kokkos::deep_copy(d_map,h_map);
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeGaussianGridLocalKokkos<DeviceType>::~ComputeGaussianGridLocalKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_alocal,alocal);
+  //gridlocal_allocated = 0;
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
+{
+
+  ComputeGridLocal::setup();
+
+  // allocate arrays
+  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
+  array_local = alocal;
+  d_alocal = k_alocal.template view<DeviceType>();
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::init()
+{
+  ComputeGaussianGridLocal::init();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
+{
+  if (host_flag) {
+    return;
+  }
+
+  invoked_local = update->ntimestep;
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
+  // number of atoms.
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  // printf(">>> total_range: %d\n", total_range);
+  chunksize = 32768; // 100*32768
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+
+  int vector_length_default = 1;
+  int team_size_default = 1;
+  if (!host_flag)
+    team_size_default = 1; // cost will increase with increasing team size //32;//max_neighs;
+
+  if (triclinic){
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+    //Neigh
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagComputeGaussianGridLocalNeigh>(chunk_size,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh> policy_neigh(chunk_size,team_size,vector_length);
+      Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+  } // end while
+
+  copymode = 0;
+
+  k_alocal.template modify<DeviceType>();
+  k_alocal.template sync<LMPHostType>();
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const
+{
+  const int ii = team.league_rank();
+
+  if (ii >= chunk_size) return;
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // Zeroing out the components, which are filled as a sum.
+  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+    d_alocal(igrid, icol) = 0.0;
+  }
+
+  // Fill grid info columns
+  d_alocal(igrid, 0) = ix;
+  d_alocal(igrid, 1) = iy;
+  d_alocal(igrid, 2) = iz;
+  d_alocal(igrid, 3) = xtmp;
+  d_alocal(igrid, 4) = ytmp;
+  d_alocal(igrid, 5) = ztmp;
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+  const int itype = 1;
+  int ielem = 0;
+  ielem = d_map[itype];
+  const double radi = d_radelem[ielem];
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+
+  // Looping over ntotal for now.
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    if (rsq < rnd_cutsq(jtype, jtype) ) {
+      int icol = size_local_cols_base + jtype - 1;
+      d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1));
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   check max team size
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<class TagStyle>
+void ComputeGaussianGridLocalKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+namespace LAMMPS_NS {
+template class ComputeGaussianGridLocalKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeGaussianGridLocalKokkos<LMPHostType>;
+#endif
+}
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
@ -0,0 +1,96 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(gaussian/grid/local/kk,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
+ComputeStyle(gaussian/grid/local/kk/device,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
+ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos<LMPHostType>);
+// clang-format on
+
+#else
+
+#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
+#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
+
+#include "compute_gaussian_grid_local.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+// clang-format off
+struct TagComputeGaussianGridLocalNeigh{};
+// clang-format on
+
+template <class DeviceType> class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+#endif
+
+  ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **);
+  ~ComputeGaussianGridLocalKokkos() override;
+  void setup() override;
+  void init() override;
+  void compute_local() override;
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&, int);
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const;
+
+ private:
+  Kokkos::View<double*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<double*, DeviceType> d_sigmaelem;
+  Kokkos::View<double*, DeviceType> d_prefacelem;
+  Kokkos::View<double*, DeviceType> d_argfacelem;
+  Kokkos::View<int*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<int*, DeviceType> d_map;                    // mapping from atom types to elements
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+
+  int max_neighs, inum, chunk_size, chunk_offset;
+  int host_flag;
+  int total_range; // total number of loop iterations in grid
+  int xlen, ylen, zlen;
+  int chunksize;
+  int ntotal;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+
+  DAT::tdual_float_2d k_alocal;
+  typename AT::t_float_2d d_alocal;
+
+  // triclinic vars
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/KOKKOS/compute_sna_grid_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@ -0,0 +1,25 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+#include "compute_sna_grid_kokkos_impl.h"
+
+namespace LAMMPS_NS {
+
+template class ComputeSNAGridKokkosDevice<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridKokkosHost<LMPHostType>;
+#endif
+
+}
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@ -0,0 +1,297 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+#ifdef LMP_KOKKOS_GPU
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost<LMPHostType>);
+#else
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
+#endif
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
+
+#include "compute_sna_grid.h"
+#include "kokkos_type.h"
+#include "sna_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Routines for both the CPU and GPU backend
+
+// GPU backend only
+struct TagCSNAGridComputeNeigh{};
+struct TagCSNAGridComputeCayleyKlein{};
+struct TagCSNAGridPreUi{};
+struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
+struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
+struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+template <bool chemsnap> struct TagCSNAGridComputeZi{};
+template <bool chemsnap> struct TagCSNAGridComputeBi{};
+struct TagCSNAGridLocalFill{}; // fill the gridlocal array
+
+struct TagComputeSNAGridLoop{};
+struct TagComputeSNAGrid3D{};
+
+// CPU backend only
+struct TagComputeSNAGridLoopCPU{};
+
+//template<class DeviceType>
+template<class DeviceType, typename real_type_, int vector_length_>
+class ComputeSNAGridKokkos : public ComputeSNAGrid {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  static constexpr int vector_length = vector_length_;
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+
+  // this empirically reduces perf fluctuations from compiler version to compiler version
+  static constexpr int min_blocks_compute_zi = 4;
+  static constexpr int min_blocks_compute_yi = 4;
+#endif
+
+  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
+
+  // MDRangePolicy for the 3D grid loop:
+  template <class Device, class TagComputeSNA>
+  using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
+
+  // Testing out team policies
+  template <class Device, int num_teams,  class TagComputeSNA>
+  using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+  //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNA>;
+  //using team_member = typename team_policy::member_type;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <class Device, int num_teams, class TagComputeSNA>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Helper routine that returns a CPU or a GPU policy as appropriate
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
+    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
+                                                                 {vector_length, second_loop, chunk_size_div},
+                                                                 {vector_length, num_tiles, 1});
+  }
+
+  ComputeSNAGridKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridKokkos() override;
+
+  void setup() override;
+  void compute_array() override;
+
+  // Utility functions for teams
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&);
+
+  // operator function for example team policy
+  //KOKKOS_INLINE_FUNCTION
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoop, const int& ) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoopCPU, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalFill,const int& ii) const;
+
+ protected:
+
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
+
+  int max_neighs, chunk_size, chunk_offset;
+  int host_flag;
+  int ntotal;
+  int total_range; // total number of loop iterations in grid
+  int zlen; //= nzhi-nzlo+1;
+  int ylen; //= nyhi-nylo+1;
+  int xlen; //= nxhi-nxlo+1;
+
+  double cutsq_tmp; // temporary cutsq until we get a view
+
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
+  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+  Kokkos::View<real_type*, DeviceType> d_test;              // test view
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+  DAT::tdual_float_2d k_grid;
+  DAT::tdual_float_2d k_gridall;
+  typename AT::t_float_2d d_grid;
+  typename AT::t_float_2d d_gridall;
+
+  DAT::tdual_float_4d k_gridlocal;
+  typename AT::t_float_4d d_gridlocal;
+
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  class DomainKokkos *domainKK;
+
+  // triclinic vars
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+
+  // Make SNAKokkos a friend
+  friend class SNAKokkos<DeviceType, real_type, vector_length>;
+};
+
+// These wrapper classes exist to make the compute style factory happy/avoid having
+// to extend the compute  style factory to support Compute classes w/an arbitrary number
+// of extra template parameters
+
+template <class DeviceType>
+class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
+
+  void compute_array() override;
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+template <class DeviceType>
+class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
+
+  void compute_array() override;
+
+};
+#endif
+
+}
+
+#endif
+#endif
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@ -0,0 +1,786 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
+                         Evan Weinberg (NVIDIA)
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+#include "pair_snap_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "domain.h"
+#include "domain_kokkos.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+namespace LAMMPS_NS {
+
+// Constructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
+
+  cutsq_tmp = cutsq[1][1];
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
+      k_cutsq.template modify<LMPHostType>();
+    }
+  }
+
+   // Set up element lists
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
+  // test
+  MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
+
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
+
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // test
+  auto h_test = Kokkos::create_mirror_view(d_test);
+  h_test(0) = 2.0;
+
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_wjelem(i-1) = wjelem[i];
+    if (switchinnerflag){
+      h_sinnerelem(i) = sinnerelem[i];
+      h_dinnerelem(i) = dinnerelem[i];
+    }
+  }
+
+  // In pair snap some things like `map` get allocated regardless of chem flag.
+  if (chemflag){
+    for (int i = 1; i <= atom->ntypes; i++) {
+      h_map(i) = map[i];
+    }
+  }
+
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag){
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  if (chemflag){
+    Kokkos::deep_copy(d_map,h_map);
+  }
+  Kokkos::deep_copy(d_test,h_test);
+
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
+}
+
+// Destructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_gridall, gridall);
+}
+
+// Setup
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
+{
+  // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
+  // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
+
+  ComputeGrid::set_grid_global();
+  ComputeGrid::set_grid_local();
+
+  // allocate arrays
+  memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
+
+  // do not use or allocate gridlocal for now
+
+  gridlocal_allocated = 0;
+  array = gridall;
+
+  d_gridlocal = k_gridlocal.template view<DeviceType>();
+  d_gridall = k_gridall.template view<DeviceType>();
+}
+
+// Compute
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
+{
+  if (host_flag) {
+    ComputeSNAGrid::compute_array();
+    return;
+  }
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
+  // number of atoms.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+  snaKK.grow_rij(chunk_size, max_neighs);
+
+  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+  if (triclinic) {
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+
+    //ComputeNeigh
+    {
+      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
+        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeCayleyKlein
+    {
+      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
+        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+    }
+
+    //PreUi
+    {
+      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>(chunk_size_div, twojmax + 1);
+      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
+    }
+
+    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+    {
+      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
+      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+      const int tile_size = vector_length * (twojmax + 1);
+      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+      if (chunk_size < parallel_thresh)
+      {
+        // Version with parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
+        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+      } else {
+        // Version w/out parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal)
+        const int n_teams = chunk_size_div * max_neighs;
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      }
+    }
+
+    //TransformUi: un-"fold" ulisttot, zero ylist
+    {
+      // Expand ulisttot_re,_im -> ulisttot
+      // Zero out ylist
+      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>(chunk_size_div, snaKK.idxu_max);
+      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
+    }
+
+    //Compute bispectrum
+    // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h`
+
+    //ComputeZi and Bi
+    if (nelements > 1) {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
+    } else {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
+    }
+
+    // Fill the grid array with bispectrum values
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
+      Kokkos::parallel_for(policy_fill, *this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+
+  } // end while
+
+  copymode = 0;
+
+  k_gridlocal.template modify<DeviceType>();
+  k_gridlocal.template sync<LMPHostType>();
+
+  k_gridall.template modify<DeviceType>();
+  k_gridall.template sync<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+/*
+ Simple team policy functor seeing how many layers deep we can go with the parallelism.
+ */
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
+
+  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
+  // Main difference is that we don't use the neighbor class or neighbor variables here.
+  // This is because the grid points are not atoms and therefore do not get assigned
+  // neighbors in LAMMPS.
+  // TODO: If we did make a neighborlist for each grid point, we could use current
+  //       routines and avoid having to loop over all atoms (which limits us to
+  //       natoms = max team size).
+
+  // basic quantities associated with this team:
+  // team_rank : rank of thread in this team
+  // league_rank : rank of team in this league
+  // team_size : number of threads in this team
+
+  // extract loop index
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+
+  if (ii >= chunk_size) return;
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  //const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  //const int team_rank = team.team_rank();
+  //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+  //const double radi = d_radelem[ielem];
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+
+  // Looping over ntotal for now.
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // don't include atoms that share location with grid point
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    }
+
+    if (jtype >= 0)
+      ninside++;
+  }
+
+  d_ninside(ii) = ninside;
+
+  // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    //const int jtype = type_cache[j];
+    //if (jtype >= 0) {
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    int jtype = type(j);
+    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        snaKK.element(ii,offset) = jelem;
+      else
+        snaKK.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_cayley_klein(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const {
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+
+  const int itype = type(iatom);
+  const int ielem = d_map[itype];
+
+  for (int j = 0; j <= twojmax; j++)
+    snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (idxu >= snaKK.idxu_max) return;
+  snaKK.transform_ui(iatom, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const {
+  if (iatom >= chunk_size) return;
+  snaKK.transform_ui(iatom, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int idxu = 0; idxu < snaKK.idxu_max; idxu++)
+    snaKK.transform_ui(iatom, idxu);
+}
+
+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+   view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjz = 0; jjz < snaKK.idxz_max; jjz++)
+    snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjb >= snaKK.idxb_max) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjb = 0; jjb < snaKK.idxb_max; jjb++)
+    snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  // int igrid = iz * (nx * ny) + iy * nx + ix;
+  // printf("ii igrid: %d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+  d_gridall(igrid,0) = xtmp;
+  d_gridall(igrid,1) = ytmp;
+  d_gridall(igrid,2) = ztmp;
+
+  const auto idxb_max = snaKK.idxb_max;
+
+  // linear contributions
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto idxb = icoeff % idxb_max;
+    const auto idx_chem = icoeff / idxb_max;
+    d_gridall(igrid,icoeff+3) = snaKK.blist(ii,idx_chem,idxb);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   utility functions
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<typename scratch_type>
+int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
+  typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
+
+  return ScratchViewType::shmem_size(values_per_team);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
+{
+  Base::compute_array();
+}
+
+#ifdef LMP_KOKKOS_GPU
+template<class DeviceType>
+ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
+{
+  Base::compute_array();
+}
+#endif
+
+}
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
@ -0,0 +1,25 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_local_kokkos.h"
+#include "compute_sna_grid_local_kokkos_impl.h"
+
+namespace LAMMPS_NS {
+
+template class ComputeSNAGridLocalKokkosDevice<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridLocalKokkosHost<LMPHostType>;
+#endif
+
+}
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@ -0,0 +1,288 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/local/kk,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
+ComputeStyle(sna/grid/local/kk/device,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
+#ifdef LMP_KOKKOS_GPU
+ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosHost<LMPHostType>);
+#else
+ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice<LMPHostType>);
+#endif
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
+
+#include "compute_sna_grid_local.h"
+#include "kokkos_type.h"
+#include "sna_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Routines for both the CPU and GPU backend
+
+// GPU backend only
+struct TagCSNAGridLocalComputeNeigh{};
+struct TagCSNAGridLocalComputeCayleyKlein{};
+struct TagCSNAGridLocalPreUi{};
+struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence
+struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence
+struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+template <bool chemsnap> struct TagCSNAGridLocalComputeZi{};
+template <bool chemsnap> struct TagCSNAGridLocalComputeBi{};
+struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array
+
+struct TagComputeSNAGridLocalLoop{};
+struct TagComputeSNAGridLocal3D{};
+
+// CPU backend only
+struct TagComputeSNAGridLocalLoopCPU{};
+
+//template<class DeviceType>
+template<class DeviceType, typename real_type_, int vector_length_>
+class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  static constexpr int vector_length = vector_length_;
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+
+  // this empirically reduces perf fluctuations from compiler version to compiler version
+  static constexpr int min_blocks_compute_zi = 4;
+  static constexpr int min_blocks_compute_yi = 4;
+#endif
+
+  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
+
+  // MDRangePolicy for the 3D grid loop:
+  template <class Device, class TagComputeSNA>
+  using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
+
+  // Testing out team policies
+  template <class Device, int num_teams,  class TagComputeSNA>
+  using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <class Device, int num_teams, class TagComputeSNA>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Helper routine that returns a CPU or a GPU policy as appropriate
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
+    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
+                                                                 {vector_length, second_loop, chunk_size_div},
+                                                                 {vector_length, num_tiles, 1});
+  }
+
+  ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridLocalKokkos() override;
+
+  void setup() override;
+  void compute_local() override;
+
+  // Utility functions for teams
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&);
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLocalLoop, const int& ) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLocalLoopCPU, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeNeigh>::member_type& team) const;
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocal2Fill,const int& ii) const;
+
+ protected:
+
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
+
+  int max_neighs, chunk_size, chunk_offset;
+  int host_flag;
+  int ntotal;
+  int total_range; // total number of loop iterations in grid
+  int zlen; //= nzhi-nzlo+1;
+  int ylen; //= nyhi-nylo+1;
+  int xlen; //= nxhi-nxlo+1;
+
+  double cutsq_tmp; // temporary cutsq until we get a view
+
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
+  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+  Kokkos::View<real_type*, DeviceType> d_test;              // test view
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+
+  DAT::tdual_float_2d k_alocal;
+  typename AT::t_float_2d d_alocal;
+
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  class DomainKokkos *domainKK;
+
+  // triclinic vars
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+
+  // Make SNAKokkos a friend
+  friend class SNAKokkos<DeviceType, real_type, vector_length>;
+};
+
+// These wrapper classes exist to make the compute style factory happy/avoid having
+// to extend the compute  style factory to support Compute classes w/an arbitrary number
+// of extra template parameters
+
+template <class DeviceType>
+class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+ public:
+
+  ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **);
+
+  void compute_local() override;
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+template <class DeviceType>
+class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+ public:
+
+  ComputeSNAGridLocalKokkosHost(class LAMMPS *, int, char **);
+
+  void compute_local() override;
+
+};
+#endif
+
+}
+
+#endif
+#endif
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@ -0,0 +1,783 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Andrew Rohskopf (SNL)
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_local_kokkos.h"
+#include "pair_snap_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "domain.h"
+#include "domain_kokkos.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+namespace LAMMPS_NS {
+
+// Constructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocal(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridLocalKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
+
+  cutsq_tmp = cutsq[1][1];
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
+      k_cutsq.template modify<LMPHostType>();
+    }
+  }
+
+   // Set up element lists
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridLocalKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridLocalKokkos:wjelem",nelements);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridLocalKokkos:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridLocalKokkos:dinnerelem",nelements);
+  // test
+  MemKK::realloc_kokkos(d_test, "ComputeSNAGridLocalKokkos::test", nelements);
+
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridLocalKokkos::map",n+1);
+
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // test
+  auto h_test = Kokkos::create_mirror_view(d_test);
+  h_test(0) = 2.0;
+
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_wjelem(i-1) = wjelem[i];
+    if (switchinnerflag){
+      h_sinnerelem(i) = sinnerelem[i];
+      h_dinnerelem(i) = dinnerelem[i];
+    }
+  }
+
+  // In pair snap some things like `map` get allocated regardless of chem flag.
+  if (chemflag){
+    for (int i = 1; i <= atom->ntypes; i++) {
+      h_map(i) = map[i];
+    }
+  }
+
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag){
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  if (chemflag){
+    Kokkos::deep_copy(d_map,h_map);
+  }
+  Kokkos::deep_copy(d_test,h_test);
+
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
+}
+
+// Destructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridLocalKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_alocal,alocal);
+}
+
+// Setup
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::setup()
+{
+
+  ComputeGridLocal::setup();
+
+  // allocate arrays
+  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
+  array_local = alocal;
+  d_alocal = k_alocal.template view<DeviceType>();
+}
+
+// Compute
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_local()
+{
+  if (host_flag) {
+    ComputeSNAGridLocal::compute_array();
+    return;
+  }
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
+  // number of atoms.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+  //snaKK.grow_rij(chunk_size, ntotal);
+  snaKK.grow_rij(chunk_size, max_neighs);
+
+  //chunk_size = total_range;
+
+  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+  if (triclinic) {
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+
+    //ComputeNeigh
+    {
+      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridLocalComputeNeigh>
+        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeCayleyKlein
+    {
+      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridLocalComputeCayleyKlein>
+        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+    }
+
+    //PreUi
+    {
+      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridLocalPreUi>(chunk_size_div, twojmax + 1);
+      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
+    }
+
+    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+    {
+      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
+      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+      const int tile_size = vector_length * (twojmax + 1);
+      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+      if (chunk_size < parallel_thresh)
+      {
+        // Version with parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
+        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+      } else {
+        // Version w/out parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal)
+        const int n_teams = chunk_size_div * max_neighs;
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      }
+    }
+
+    //TransformUi: un-"fold" ulisttot, zero ylist
+    {
+      // Expand ulisttot_re,_im -> ulisttot
+      // Zero out ylist
+      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridLocalTransformUi>(chunk_size_div, snaKK.idxu_max);
+      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
+    }
+
+    //Compute bispectrum
+    // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h`
+
+    //ComputeZi and Bi
+    if (nelements > 1) {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
+    } else {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
+
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
+    }
+
+    // Fill the grid array with bispectrum values
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocal2Fill> policy_fill(0,chunk_size);
+      Kokkos::parallel_for(policy_fill, *this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+
+  } // end while
+
+  copymode = 0;
+
+  k_alocal.template modify<DeviceType>();
+  k_alocal.template sync<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+/*
+ Simple team policy functor seeing how many layers deep we can go with the parallelism.
+ */
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeNeigh>::member_type& team) const {
+
+  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
+  // Main difference is that we don't use the neighbor class or neighbor variables here.
+  // This is because the grid points are not atoms and therefore do not get assigned
+  // neighbors in LAMMPS.
+  // TODO: If we did make a neighborlist for each grid point, we could use current
+  //       routines and avoid having to loop over all atoms (which limits us to
+  //       natoms = max team size).
+
+  // basic quantities associated with this team:
+  // team_rank : rank of thread in this team
+  // league_rank : rank of team in this league
+  // team_size : number of threads in this team
+
+  // extract loop index
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+
+  if (ii >= chunk_size) return;
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  //const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  //const int team_rank = team.team_rank();
+  //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // Zeroing out the components, which are filled as a sum.
+  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+    d_alocal(igrid, icol) = 0.0;
+  }
+
+  // Fill grid info columns
+  d_alocal(igrid, 0) = ix;
+  d_alocal(igrid, 1) = iy;
+  d_alocal(igrid, 2) = iz;
+  d_alocal(igrid, 3) = xtmp;
+  d_alocal(igrid, 4) = ytmp;
+  d_alocal(igrid, 5) = ztmp;
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+  //const double radi = d_radelem[ielem];
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+
+  // Looping over ntotal for now.
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // don't include atoms that share location with grid point
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    }
+
+    if (jtype >= 0)
+      ninside++;
+  }
+
+  d_ninside(ii) = ninside;
+
+  // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    //const int jtype = type_cache[j];
+    //if (jtype >= 0) {
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    int jtype = type(j);
+    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        snaKK.element(ii,offset) = jelem;
+      else
+        snaKK.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_cayley_klein(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const {
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+
+  const int itype = type(iatom);
+  const int ielem = d_map[itype];
+
+  for (int j = 0; j <= twojmax; j++)
+    snaKK.pre_ui(iatom, j, ielem);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiSmall>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiLarge>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (idxu >= snaKK.idxu_max) return;
+  snaKK.transform_ui(iatom, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const {
+  if (iatom >= chunk_size) return;
+  snaKK.transform_ui(iatom, idxu);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int idxu = 0; idxu < snaKK.idxu_max; idxu++)
+    snaKK.transform_ui(iatom, idxu);
+}
+
+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+   view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjz = 0; jjz < snaKK.idxz_max; jjz++)
+    snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjb >= snaKK.idxb_max) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjb = 0; jjb < snaKK.idxb_max; jjb++)
+    snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocal2Fill, const int& ii) const {
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  // int igrid = iz * (nx * ny) + iy * nx + ix;
+  // printf("ii igrid: %d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+
+  const auto idxb_max = snaKK.idxb_max;
+
+  // linear contributions
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto idxb = icoeff % idxb_max;
+    const auto idx_chem = icoeff / idxb_max;
+    d_alocal(igrid,icoeff+6) = snaKK.blist(ii,idx_chem,idxb);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   utility functions
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<typename scratch_type>
+int ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
+  typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
+
+  return ScratchViewType::shmem_size(values_per_team);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+ComputeSNAGridLocalKokkosDevice<DeviceType>::ComputeSNAGridLocalKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosDevice<DeviceType>::compute_local()
+{
+  Base::compute_local();
+}
+
+#ifdef LMP_KOKKOS_GPU
+template<class DeviceType>
+ComputeSNAGridLocalKokkosHost<DeviceType>::ComputeSNAGridLocalKokkosHost(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosHost<DeviceType>::compute_local()
+{
+  Base::compute_local();
+}
+#endif
+
+}
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@ -792,6 +792,14 @@ typedef tdual_float_3d::t_dev_um t_float_3d_um;
 typedef tdual_float_3d::t_dev_const_um t_float_3d_const_um;
 typedef tdual_float_3d::t_dev_const_randomread t_float_3d_randomread;

+//4d float array n
+typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
+typedef tdual_float_4d::t_dev t_float_4d;
+typedef tdual_float_4d::t_dev_const t_float_4d_const;
+typedef tdual_float_4d::t_dev_um t_float_4d_um;
+typedef tdual_float_4d::t_dev_const_um t_float_4d_const_um;
+typedef tdual_float_4d::t_dev_const_randomread t_float_4d_randomread;
+
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else
@ -1126,6 +1134,14 @@ typedef tdual_float_3d::t_host_um t_float_3d_um;
 typedef tdual_float_3d::t_host_const_um t_float_3d_const_um;
 typedef tdual_float_3d::t_host_const_randomread t_float_3d_randomread;

+//4d float array n
+typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
+typedef tdual_float_4d::t_host t_float_4d;
+typedef tdual_float_4d::t_host_const t_float_4d_const;
+typedef tdual_float_4d::t_host_um t_float_4d_um;
+typedef tdual_float_4d::t_host_const_um t_float_4d_const_um;
+typedef tdual_float_4d::t_host_const_randomread t_float_4d_randomread;
+
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else
--- a/src/KOKKOS/memory_kokkos.h
+++ b/src/KOKKOS/memory_kokkos.h
@ -101,6 +101,7 @@ template <typename TYPE, typename HTYPE>
 {
  data = TYPE(std::string(name),n1,n2);
  h_data = Kokkos::create_mirror_view(data);
+  //printf(">>> name: %s\n", name);
  return data;
 }

@ -111,6 +112,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
  data = TYPE(std::string(name),n1,n2);
  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
  array = (typename TYPE::value_type **) smalloc(nbytes,name);
+  //printf(">>> name %s nbytes %d\n", name, nbytes);

  for (int i = 0; i < n1; i++) {
    if (n2 == 0)
@ -121,6 +123,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
  return data;
 }

+/* ----------------------------------------------------------------------
+   create a 4d array with indices 2,3,4 offset, but not first
+   2nd index from n2lo to n2hi inclusive
+   3rd index from n3lo to n3hi inclusive
+   4th index from n4lo to n4hi inclusive
+   cannot grow it
+------------------------------------------------------------------------- */
+
+template <typename TYPE>
+TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array,
+                             int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi,
+                             const char *name)
+{
+  //if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array =  nullptr;
+
+  printf("^^^^^ memoryKK->create_4d_offset_kokkos\n");
+
+  int n2 = n2hi - n2lo + 1;
+  int n3 = n3hi - n3lo + 1;
+  int n4 = n4hi - n4lo + 1;
+  data = TYPE(std::string(name),n1,n2,n3,n4);
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1;
+  array = (typename TYPE::value_type ****) smalloc(nbytes,name);
+
+  for (int i = 0; i < n1; i++) {
+    if (n2 == 0) {
+      array[i] = nullptr;
+    } else {
+      nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2;
+      array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name);
+      for (int j = 0; j < n2; j++){
+        if (n3 == 0){
+          array[i][j] = nullptr;
+        } else {
+          nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3;
+          array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name);
+          for (int k = 0; k < n3; k++){
+            if (n4 == 0)
+              array[i][j][k] = nullptr;
+            else
+              array[i][j][k] = &data.h_view(i,j,k,0);
+          }
+        }
+      }
+    }
+  }
+
+  return data;
+}
+
 template <typename TYPE, typename HTYPE>
  TYPE create_kokkos(TYPE &data, HTYPE &h_data,
                     typename TYPE::value_type **&array, int n1, int n2,
--- a/src/KOKKOS/pair_mliap_kokkos.cpp
+++ b/src/KOKKOS/pair_mliap_kokkos.cpp
@ -240,6 +240,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
      if (strcmp(elemname,descriptor->elements[jelem]) == 0)
        break;

+    //printf(">>> nelements: %d\n", descriptor->nelements);
    if (jelem < descriptor->nelements)
      map[i] = jelem;
    else if (strcmp(elemname,"NULL") == 0) map[i] = -1;
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@ -375,7 +375,6 @@ class PairSNAPKokkos : public PairSNAP {

  // Make SNAKokkos a friend
  friend class SNAKokkos<DeviceType, real_type, vector_length>;
-
 };


--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -3,12 +3,10 @@
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
-
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
-
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

@ -39,17 +37,6 @@

 namespace LAMMPS_NS {

-// Outstanding issues with quadratic term
-// 1. there seems to a problem with compute_optimized energy calc
-// it does not match compute_regular, even when quadratic coeffs = 0
-
-//static double t1 = 0.0;
-//static double t2 = 0.0;
-//static double t3 = 0.0;
-//static double t4 = 0.0;
-//static double t5 = 0.0;
-//static double t6 = 0.0;
-//static double t7 = 0.0;
 /* ---------------------------------------------------------------------- */

 template<class DeviceType, typename real_type, int vector_length>
@ -219,7 +206,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
      // team_size_compute_neigh is defined in `pair_snap_kokkos.h`
      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs);

-      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh>
+        policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
    }
@ -259,7 +247,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;

-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
        Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this);
      } else {
@ -269,7 +258,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
        const int n_teams = chunk_size_div * max_neighs;
        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;

-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
        Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this);
      }
@ -536,8 +526,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::coeff(int narg, char
  Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
  Kokkos::deep_copy(d_map,h_map);

-  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this); //rfac0,twojmax,
-    //rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
  snaKK.grow_rij(0,0);
  snaKK.init();
 }
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@ -29,7 +29,9 @@
 #endif

 namespace LAMMPS_NS {
-
+// copied from pair_snap_kokkos.h
+// pre-declare so sna_kokkos.h can refer to it
+template<class DeviceType, typename real_type_, int vector_length_> class PairSNAPKokkos;
 template<typename real_type_, int vector_length_>
 struct WignerWrapper {
  using real_type = real_type_;
@ -170,9 +172,9 @@ class SNAKokkos {
  KOKKOS_INLINE_FUNCTION
  SNAKokkos(const SNAKokkos<DeviceType,real_type,vector_length>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);

+  template<class CopyClass>
  inline
-  //SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int);
-  SNAKokkos(const PairSNAPKokkos<DeviceType, real_type, vector_length>&);
+  SNAKokkos(const CopyClass&);

  KOKKOS_INLINE_FUNCTION
  ~SNAKokkos();
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@ -29,17 +29,18 @@ static const double MY_PI  = 3.14159265358979323846; // pi
 static const double MY_PI2  = 1.57079632679489661923; // pi/2

 template<class DeviceType, typename real_type, int vector_length>
+template<class CopyClass>
 inline
-SNAKokkos<DeviceType, real_type, vector_length>::SNAKokkos(const PairSNAPKokkos<DeviceType, real_type, vector_length>& psk)
-  : rfac0(psk.rfac0), rmin0(psk.rmin0), switch_flag(psk.switchflag),
-    bzero_flag(psk.bzeroflag), chem_flag(psk.chemflag), bnorm_flag(psk.bnormflag),
-    wselfall_flag(psk.wselfallflag), switch_inner_flag(psk.switchinnerflag),
-    quadratic_flag(psk.quadraticflag), twojmax(psk.twojmax), d_coeffelem(psk.d_coeffelem)
+SNAKokkos<DeviceType, real_type, vector_length>::SNAKokkos(const CopyClass& copy)
+  : twojmax(copy.twojmax), d_coeffelem(copy.d_coeffelem), rmin0(copy.rmin0),
+    rfac0(copy.rfac0), switch_flag(copy.switchflag), switch_inner_flag(copy.switchinnerflag),
+    chem_flag(copy.chemflag), bnorm_flag(copy.bnormflag), wselfall_flag(copy.wselfallflag),
+    quadratic_flag(copy.quadraticflag), bzero_flag(copy.bzeroflag)
 {
  wself = static_cast<real_type>(1.0);

  if (chem_flag)
-    nelements = psk.nelements;
+    nelements = copy.nelements;
  else
    nelements = 1;

@ -611,7 +612,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
    }

    ulist_wrapper.set(ma, ulist_accum);
-
    mb++;
  }

@ -830,7 +830,6 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
  int jju1 = idxu_block[j1] + (j1+1)*mb1min;
  int jju2 = idxu_block[j2] + (j2+1)*mb2max;
  int icgb = mb1min*(j2+1) + mb2max;
-
  #ifdef LMP_KK_DEVICE_COMPILE
  #pragma unroll
  #endif
--- a/src/ML-SNAP/compute_gaussian_grid_local.cpp
+++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp
@ -0,0 +1,166 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_gaussian_grid_local.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "modify.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using MathConst::MY_2PI;
+using MathSpecial::powint;
+
+ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char **arg) :
+    ComputeGridLocal(lmp, narg, arg), cutsq(nullptr), radelem(nullptr),
+    sigmaelem(nullptr), prefacelem(nullptr), argfacelem(nullptr)
+{
+  // skip over arguments used by base class
+  // so that argument positions are identical to
+  // regular per-atom compute
+
+  arg += nargbase;
+  narg -= nargbase;
+
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+
+  int ntypes = atom->ntypes;
+  int nargmin = 4 + 2 * ntypes;
+
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);
+
+  // process required arguments
+
+  memory->create(radelem, ntypes + 1, "gaussian/atom:radelem");    // offset by 1 to match up with types
+  memory->create(sigmaelem, ntypes + 1, "gaussian/atom:sigmaelem");
+  memory->create(prefacelem, ntypes + 1, "gaussian/atom:prefacelem");
+  memory->create(argfacelem, ntypes + 1, "gaussian/atom:argfacelem");
+
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+
+  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp);
+  for (int i = 0; i < ntypes; i++)
+    sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp);
+
+  // construct cutsq
+  double cut;
+  cutmax = 0.0;
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "gaussian/atom:cutsq");
+  for (int i = 1; i <= ntypes; i++) {
+    cut = 2.0 * radelem[i] * rcutfac;
+    if (cut > cutmax) cutmax = cut;
+    cutsq[i][i] = cut * cut;
+    for (int j = i + 1; j <= ntypes; j++) {
+      cut = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cut * cut;
+    }
+  }
+
+  size_local_cols = size_local_cols_base + ntypes;
+
+  // pre-compute coefficients
+  for (int i = 0; i < ntypes; i++) {
+    prefacelem[i + 1] = 1.0/powint(sigmaelem[i + 1] * sqrt(MY_2PI), 3);
+    argfacelem[i + 1] = 1.0/(2.0 * sigmaelem[i + 1] * sigmaelem[i + 1]);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeGaussianGridLocal::~ComputeGaussianGridLocal()
+{
+  if (copymode) return;
+  memory->destroy(radelem);
+  memory->destroy(sigmaelem);
+  memory->destroy(prefacelem);
+  memory->destroy(argfacelem);
+  memory->destroy(cutsq);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGaussianGridLocal::init()
+{
+  if ((modify->get_compute_by_style("^gaussian/grid/local$").size() > 1) && (comm->me == 0))
+    error->warning(FLERR, "More than one instance of compute gaussian/grid/local");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGaussianGridLocal::compute_local()
+{
+  invoked_local = update->ntimestep;
+
+  // compute gaussian for each gridpoint
+
+  double **const x = atom->x;
+  const int *const mask = atom->mask;
+  int *const type = atom->type;
+  const int ntotal = atom->nlocal + atom->nghost;
+
+  int igrid = 0;
+  for (int iz = nzlo; iz <= nzhi; iz++)
+    for (int iy = nylo; iy <= nyhi; iy++)
+      for (int ix = nxlo; ix <= nxhi; ix++) {
+        double xgrid[3];
+        grid2x(ix, iy, iz, xgrid);
+        const double xtmp = xgrid[0];
+        const double ytmp = xgrid[1];
+        const double ztmp = xgrid[2];
+
+        // Zeroing out the components, which are filled as a sum.
+        for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+          alocal[igrid][icol] = 0.0;
+        }
+
+        for (int j = 0; j < ntotal; j++) {
+
+          // check that j is in compute group
+
+          if (!(mask[j] & groupbit)) continue;
+
+          const double delx = xtmp - x[j][0];
+          const double dely = ytmp - x[j][1];
+          const double delz = ztmp - x[j][2];
+          const double rsq = delx * delx + dely * dely + delz * delz;
+          int jtype = type[j];
+          if (rsq < cutsq[jtype][jtype]) {
+            int icol = size_local_cols_base + jtype - 1;
+            alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]);
+          }
+        }
+        igrid++;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   memory usage
+------------------------------------------------------------------------- */
+
+double ComputeGaussianGridLocal::memory_usage()
+{
+  int n = atom->ntypes + 1;
+  int nbytes = (double) n * sizeof(int);    // map
+
+  return nbytes;
+}
--- a/src/ML-SNAP/compute_gaussian_grid_local.h
+++ b/src/ML-SNAP/compute_gaussian_grid_local.h
@ -0,0 +1,51 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(gaussian/grid/local,ComputeGaussianGridLocal);
+// clang-format on
+#else
+
+#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
+#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
+
+#include "compute_grid_local.h"
+
+namespace LAMMPS_NS {
+
+class ComputeGaussianGridLocal : public ComputeGridLocal {
+ public:
+  ComputeGaussianGridLocal(class LAMMPS *, int, char **);
+  ~ComputeGaussianGridLocal() override;
+  void init() override;
+  void compute_local() override;
+  double memory_usage() override;
+
+ protected:
+  int ncoeff;
+  double **cutsq;
+  double rcutfac;     // global cut-off scale
+  double *radelem;    // cut-off radius of each atom type
+  double *sigmaelem;  // Gaussian width of each atom type
+  double *prefacelem; // Gaussian prefactor of each atom type
+  double *argfacelem; // Gaussian argument factor of each atom type
+  int *map;    // map types to [0,nelements)
+  int nelements;
+  double cutmax;
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/ML-SNAP/compute_grid.cpp
+++ b/src/ML-SNAP/compute_grid.cpp
@ -57,6 +57,7 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :

 ComputeGrid::~ComputeGrid()
 {
+  if (copymode) return;
  deallocate();
 }

@ -111,7 +112,6 @@ void ComputeGrid::assign_coords_all()
 void ComputeGrid::allocate()
 {
  // allocate arrays
-
  memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
  memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
  if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
--- a/src/ML-SNAP/compute_grid_local.cpp
+++ b/src/ML-SNAP/compute_grid_local.cpp
@ -119,6 +119,8 @@ void ComputeGridLocal::allocate()

 void ComputeGridLocal::deallocate()
 {
+  if (copymode) return;
+
  if (gridlocal_allocated) {
    gridlocal_allocated = 0;
    memory->destroy(alocal);
--- a/src/ML-SNAP/compute_sna_grid.cpp
+++ b/src/ML-SNAP/compute_sna_grid.cpp
@ -31,14 +31,13 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  // skip over arguments used by base class
  // so that argument positions are identical to
  // regular per-atom compute
-
  arg += nargbase;
  narg -= nargbase;

  // begin code common to all SNAP computes

-  double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;

  int ntypes = atom->ntypes;
  int nargmin = 6 + 2 * ntypes;
@ -56,6 +55,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  wselfallflag = 0;
  switchinnerflag = 0;
  nelements = 1;
+  chunksize = 32768;
+  parallel_thresh = 8192;

  // process required arguments

@ -67,8 +68,9 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);

  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
-  for (int i = 0; i < ntypes; i++)
+  for (int i = 0; i < ntypes; i++) {
    wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
+  }

  // construct cutsq

@ -181,11 +183,12 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :

 ComputeSNAGrid::~ComputeSNAGrid()
 {
+  if (copymode) return;
+
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(cutsq);
  delete snaptr;
-
  if (chemflag) memory->destroy(map);
 }

@ -202,6 +205,7 @@ void ComputeSNAGrid::init()

 void ComputeSNAGrid::compute_array()
 {
+
  invoked_array = update->ntimestep;

  // compute sna for each gridpoint
--- a/src/ML-SNAP/compute_sna_grid.h
+++ b/src/ML-SNAP/compute_sna_grid.h
@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid {
  void init() override;
  void compute_array() override;
  double memory_usage() override;
+  int ncoeff,nelements; // public for kokkos, but could go in the protected block now

- private:
-  int ncoeff;
+ protected:
+  //int ncoeff;
  double **cutsq;
  double rcutfac;
  double *radelem;
  double *wjelem;
  int *map;    // map types to [0,nelements)
-  int nelements, chemflag;
+  int chemflag;
  int switchinnerflag;
  double *sinnerelem;
  double *dinnerelem;
+  int parallel_thresh;
  class SNA *snaptr;
  double cutmax;
  int quadraticflag;
+  double rfac0, rmin0;
+  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  int chunksize;
+
 };

 }    // namespace LAMMPS_NS
--- a/src/ML-SNAP/compute_sna_grid_local.cpp
+++ b/src/ML-SNAP/compute_sna_grid_local.cpp
@ -37,8 +37,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :

  // begin code common to all SNAP computes

-  double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;

  int ntypes = atom->ntypes;
  int nargmin = 6 + 2 * ntypes;
@ -56,6 +56,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
  wselfallflag = 0;
  switchinnerflag = 0;
  nelements = 1;
+  chunksize = 32768;
+  parallel_thresh = 8192;

  // process required arguments

@ -180,6 +182,7 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :

 ComputeSNAGridLocal::~ComputeSNAGridLocal()
 {
+  if (copymode) return;
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(cutsq);
--- a/src/ML-SNAP/compute_sna_grid_local.h
+++ b/src/ML-SNAP/compute_sna_grid_local.h
@ -32,7 +32,7 @@ class ComputeSNAGridLocal : public ComputeGridLocal {
  void compute_local() override;
  double memory_usage() override;

- private:
+ protected:
  int ncoeff;
  double **cutsq;
  double rcutfac;
@ -46,6 +46,10 @@ class ComputeSNAGridLocal : public ComputeGridLocal {
  class SNA *snaptr;
  double cutmax;
  int quadraticflag;
+  double rfac0, rmin0;
+  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  int chunksize;
+  int parallel_thresh;
 };

 }    // namespace LAMMPS_NS