From be94176c034234ad4280420ca5d38cafb776c7ce Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Thu, 12 Jan 2023 15:44:04 +0100
Subject: [PATCH 01/51] Re-starting MALA branch in MALA fork

---
 examples/snap/in.grid.gaussian              |  66 ++++++++
 src/ML-SNAP/compute_gaussian_grid_local.cpp | 167 ++++++++++++++++++++
 src/ML-SNAP/compute_gaussian_grid_local.h   |  51 ++++++
 src/ML-SNAP/compute_sna_grid_local.cpp      |   2 +-
 4 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 examples/snap/in.grid.gaussian
 create mode 100644 src/ML-SNAP/compute_gaussian_grid_local.cpp
 create mode 100644 src/ML-SNAP/compute_gaussian_grid_local.h
diff --git a/examples/snap/in.grid.gaussian b/examples/snap/in.grid.gaussian
new file mode 100644
index 0000000000..9caa61e455
--- /dev/null
+++ b/examples/snap/in.grid.gaussian
@@ -0,0 +1,66 @@
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# 
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable 	ny equal ${nrep}
+variable 	nz equal ${nrep}
+
+boundary	p p p
+
+lattice		custom $a &
+		a1 1 0 0 &
+		a2 0 1 0  &
+		a3 0 0 1 &
+		basis 0 0 0 &
+		basis 0.5 0.5 0.5 &
+
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+create_box	2 box
+create_atoms	1 box basis 1 1 basis 2 2
+
+mass 		* 180.88
+
+# define group and Gaussian descriptor parameters
+
+group 		snapgroup type 1
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string &
+		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+		
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_coeff      * *
+
+# define grid compute
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} &
+	 	${gaussian_options}
+
+# define output
+
+dump 1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 2 all custom 1000 dump.gatom id x y z
+
+# run
+
+run		0
+
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp
new file mode 100644
index 0000000000..ec75563bcf
--- /dev/null
+++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp
@@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_gaussian_grid_local.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "modify.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using MathConst::MY_2PI;
+using MathSpecial::powint;
+
+ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char **arg) :
+    ComputeGridLocal(lmp, narg, arg), cutsq(nullptr), radelem(nullptr),
+    sigmaelem(nullptr), prefacelem(nullptr), argfacelem(nullptr)
+{
+  // skip over arguments used by base class
+  // so that argument positions are identical to
+  // regular per-atom compute
+
+  arg += nargbase;
+  narg -= nargbase;
+
+  int ntypes = atom->ntypes;
+  int nargmin = 4 + 2 * ntypes;    // arg[0..3] plus one radius and one sigma per type
+
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);
+
+  // required arguments: arg[3] = rcutfac, then ntypes radii, then ntypes sigmas
+
+  memory->create(radelem, ntypes + 1, "gaussian/atom:radelem");    // offset by 1 to match up with types
+  memory->create(sigmaelem, ntypes + 1, "gaussian/atom:sigmaelem");
+  memory->create(prefacelem, ntypes + 1, "gaussian/atom:prefacelem");
+  memory->create(argfacelem, ntypes + 1, "gaussian/atom:argfacelem");
+
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+
+  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp);
+  for (int i = 0; i < ntypes; i++) {
+    sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp);
+    if (sigmaelem[i + 1] <= 0.0)    // sigma is a divisor in both coefficients below
+      error->all(FLERR, "Illegal compute {} command: sigma must be > 0", style);
+  }
+
+  // construct cutsq
+
+  double cut;
+  cutmax = 0.0;
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "gaussian/atom:cutsq");
+  for (int i = 1; i <= ntypes; i++) {
+    cut = 2.0 * radelem[i] * rcutfac;
+    if (cut > cutmax) cutmax = cut;
+    cutsq[i][i] = cut * cut;
+    for (int j = i + 1; j <= ntypes; j++) {
+      cut = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cut * cut;
+    }
+  }
+
+  size_local_cols = size_local_cols_base + ntypes;
+
+  // pre-compute coefficients: prefactor 1/(sigma*sqrt(2*pi))^3 and exponent factor 1/(2*sigma^2)
+
+  for (int i = 0; i < ntypes; i++) {
+    prefacelem[i + 1] = 1.0/powint(sigmaelem[i + 1] * sqrt(MY_2PI), 3);
+    argfacelem[i + 1] = 1.0/(2.0 * sigmaelem[i + 1] * sigmaelem[i + 1]);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeGaussianGridLocal::~ComputeGaussianGridLocal()
+{
+  memory->destroy(cutsq);         // table of squared cutoffs
+  memory->destroy(argfacelem);    // per-type arrays, released in reverse order of creation
+  memory->destroy(prefacelem);
+  memory->destroy(sigmaelem);
+  memory->destroy(radelem);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGaussianGridLocal::init()
+{
+  if ((comm->me == 0) && (modify->get_compute_by_style("^gaussian/grid/local$").size() > 1))
+    error->warning(FLERR, "More than one instance of compute gaussian/grid/local");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGaussianGridLocal::compute_local()
+{
+  invoked_local = update->ntimestep;
+
+  // for every grid point handled here, sum one Gaussian contribution per atom
+  // into the descriptor column belonging to that atom's type
+
+  double **const x = atom->x;
+  const int *const mask = atom->mask;
+  int *const type = atom->type;
+  const int ntotal = atom->nlocal + atom->nghost;
+
+  int igrid = 0;
+  for (int iz = nzlo; iz <= nzhi; iz++)
+    for (int iy = nylo; iy <= nyhi; iy++)
+      for (int ix = nxlo; ix <= nxhi; ix++) {
+        double xgrid[3];
+        grid2x(ix, iy, iz, xgrid);
+
+        // the descriptor columns are accumulated as sums, so reset them first
+        for (int icol = size_local_cols_base; icol < size_local_cols; icol++)
+          alocal[igrid][icol] = 0.0;
+
+        for (int j = 0; j < ntotal; j++) {
+
+          // only atoms (owned or ghost) in the compute group contribute
+          if (!(mask[j] & groupbit)) continue;
+
+          const double dx = xgrid[0] - x[j][0];
+          const double dy = xgrid[1] - x[j][1];
+          const double dz = xgrid[2] - x[j][2];
+          const double rsq = dx * dx + dy * dy + dz * dz;
+
+          // ignore atoms outside the same-type cutoff of their own type
+          const int jtype = type[j];
+          if (!(rsq < cutsq[jtype][jtype])) continue;
+
+          // type jtype accumulates into column (jtype - 1) after the base columns
+          alocal[igrid][size_local_cols_base + jtype - 1] +=
+              prefacelem[jtype] * exp(-rsq * argfacelem[jtype]);
+        }
+        // move on to the row of alocal that belongs to the next grid point
+        igrid++;
+      }
+}
+
+/* ----------------------------------------------------------------------
+   memory usage
+------------------------------------------------------------------------- */
+
+double ComputeGaussianGridLocal::memory_usage()
+{
+  const double n = atom->ntypes + 1;           // every per-type array is offset by one
+  double nbytes = 4.0 * n * sizeof(double);    // radelem, sigmaelem, prefacelem, argfacelem
+  nbytes += n * n * sizeof(double);            // cutsq
+  return nbytes;
+}
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h b/src/ML-SNAP/compute_gaussian_grid_local.h
new file mode 100644
index 0000000000..cfab841a6e
--- /dev/null
+++ b/src/ML-SNAP/compute_gaussian_grid_local.h
@@ -0,0 +1,51 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(gaussian/grid/local,ComputeGaussianGridLocal);
+// clang-format on
+#else
+
+#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
+#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
+
+#include "compute_grid_local.h"
+
+namespace LAMMPS_NS {
+
+class ComputeGaussianGridLocal : public ComputeGridLocal {
+ public:
+  ComputeGaussianGridLocal(class LAMMPS *, int, char **);
+  ~ComputeGaussianGridLocal() override;
+  void init() override;             // warns when more than one instance of this style exists
+  void compute_local() override;    // fills one Gaussian-sum column per atom type for each grid point
+  double memory_usage() override;
+
+ private:
+  int ncoeff;         // NOTE(review): not referenced by the .cpp added in this patch - confirm it is needed
+  double **cutsq;     // squared cut-off per type pair, (ntypes+1) x (ntypes+1)
+  double rcutfac;     // global cut-off scale
+  double *radelem;    // cut-off radius of each atom type
+  double *sigmaelem;  // Gaussian width of each atom type
+  double *prefacelem; // Gaussian prefactor of each atom type
+  double *argfacelem; // Gaussian argument factor of each atom type
+  int *map;    // map types to [0,nelements); NOTE(review): never allocated or freed in this patch
+  int nelements;      // NOTE(review): not referenced by the .cpp added in this patch
+  double cutmax;      // largest same-type cut-off, 2 * radelem * rcutfac
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
diff --git a/src/ML-SNAP/compute_sna_grid_local.cpp b/src/ML-SNAP/compute_sna_grid_local.cpp
index 80a1baddab..76fe03a03b 100644
--- a/src/ML-SNAP/compute_sna_grid_local.cpp
+++ b/src/ML-SNAP/compute_sna_grid_local.cpp
@@ -203,7 +203,7 @@ void ComputeSNAGridLocal::init()
 
 void ComputeSNAGridLocal::compute_local()
 {
-  invoked_array = update->ntimestep;
+  invoked_local = update->ntimestep;
 
   // compute sna for each gridpoint
 

From e1e7984822ef494e23bd67e6770398830bbfebba Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Fri, 10 Mar 2023 16:19:24 -0700
Subject: [PATCH 02/51] Start ComputeSNAGridKokkos implementation

---
 src/KOKKOS/compute_sna_grid_kokkos.cpp | 64 +++++++++++++++++++++
 src/KOKKOS/compute_sna_grid_kokkos.h   | 80 ++++++++++++++++++++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 src/KOKKOS/compute_sna_grid_kokkos.cpp
 create mode 100644 src/KOKKOS/compute_sna_grid_kokkos.h

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp
new file mode 100644
index 0000000000..0eb6e1767c
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@@ -0,0 +1,64 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "sna_kokkos.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeSNAGrid(lmp, narg, arg)
+{
+
+  printf("^^^ inside ComputeSNAGridKokkos constructor\n");
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
+{
+  if (copymode) return;
+
+
+}
+
+namespace LAMMPS_NS {
+template class ComputeSNAGridKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridKokkos<LMPHostType>;
+#endif
+}
+
diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
new file mode 100644
index 0000000000..ad365fca43
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -0,0 +1,80 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
+
+#include "compute_sna_grid.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+//template<int CSTYLE, int NCOL>
+//struct TagComputeCoordAtom{};
+
+template<class DeviceType>
+class ComputeSNAGridKokkos : public ComputeSNAGrid {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  ComputeSNAGridKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridKokkos() override;
+  //void init() override;
+  //void compute_peratom() override;
+  //enum {NONE,CUTOFF,ORIENT};
+
+  //template<int CSTYLE, int NCOL>
+  //KOKKOS_INLINE_FUNCTION
+  //void operator()(TagComputeCoordAtom<CSTYLE,NCOL>, const int&) const;
+
+ private:
+
+
+  /*
+  int inum;
+
+  typename AT::t_x_array_randomread x;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
+  typename ArrayTypes<DeviceType>::t_int_1d mask;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+
+  typename AT::t_int_1d d_typelo;
+  typename AT::t_int_1d d_typehi;
+
+  DAT::tdual_float_1d k_cvec;
+  typename AT::t_float_1d d_cvec;
+  DAT::tdual_float_2d k_carray;
+  typename AT::t_float_2d d_carray;
+
+  typename AT::t_float_2d d_normv;
+  */
+};
+
+}
+
+#endif
+#endif
+

From 234346c37d44d3a75255f8e0583381fea69eed07 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sun, 12 Mar 2023 17:00:48 -0600
Subject: [PATCH 03/51] Experiment with different implementations

---
 src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo | Bin 0 -> 16384 bytes
 src/KOKKOS/compute_sna_grid_kokkos.cpp      |  79 +++++++++++++++++---
 src/KOKKOS/compute_sna_grid_kokkos.h        |  23 +++++-
 3 files changed, 89 insertions(+), 13 deletions(-)
 create mode 100644 src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo

diff --git a/src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo b/src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo
new file mode 100644
index 0000000000000000000000000000000000000000..1f6b26487397da6524dd5e8647ab7bba3ab61fb1
GIT binary patch
literal 16384
zcmeI2U5jK#8OJLzCT@~VG*LmXp7cV}J@nh`3vSx%m|=EyBJ+~T_Ksj&wyEh;-F-6W
zR2{3TXKa?l51=3#F@hJ0f)@%w!8<R+2m$>7UMMK)M!oRDAn`(U<G<>ho}HfQnMJeW
zD0Sf1J#(u1sXG69o_eYdwb}f{$5-imy~%LBm$A=(<e}*4*S}3`FFeY4q}9M|xsoF<
zGuu3O{>d|Ej;w^jmkNW8gr{j{8b)^;%g-(^Kih6}RhkX0=ora6eH}-gEw#0!Oud_B
zx3e0`g91T;+bS@$z1sPc8)r_>`cOYrdW;@@WaG9!3l)L_L4lw^P#`D}6bK3g1%d)W
zf&U)`Y<4&MEFR}w#Ush$TNU4T72kh<JHH-2r2HR6d3s3s&7vHy82q*0esMc|1qFfv
zL4lw^P#`D}6bK3g1%d)WfuKN8ASm!3P=H5_orB#IcjE(_^YQ-wPuS>x6I=zSz?%;-
z_Gj>2z`=*WUr#dj7Wg5MAO+8WB~S++1owb9A7t$3;5G0a@CBg3^WYL#1T&xl%HWp|
zF!l@ZRZs$Vfw%8x>?ZgL_&)d|SO=%UBjA1Dk0%)W19$`60Iz_rfGdE3o9|)l_uvh1
z1N;{J27DR344wlgz<uCe@U#0E`ziPyh`<xzBzO?~`d-F<1%3d&0lo$dxCs9CZpK~(
z99#vT0?&i9;KShcdl>sMcolpOTmT;c?+1Uui<lpQZ-H09%iv4EfYV?WJOJ(oH!x<u
z178Olz>VQs;}<qZ`3n7O*JH*;tyXM0hNt4Gi()(OiYuc`R99BlZ~ZsdYHjmmC@LRi
zL|S}$D2%Pm8DWdF9G!tOwtR%CXHf3Ujt$Y{kHZolS~WID7f6b@KiE{dHpio=Q#4by
zgV<DQ&6O=EX>6-?dGWc<rHz#p^ae)!uX+jZ!$Oi~wYmIwNo{Voz*lQDcI{IV8@t<X
zPie|esyJ&zLTlfJBq?87S-gC?T3Y9^CSym5p!VAQy8mUYg@M5xt0ZNvT*;DGC+<-h
zCcDCQz!i6V@r4W2thfeV^o?j?hgv!{i5BwnF)+P^(=iwf8QZeAMTB1O2+FkD<eSNe
zcDR(Bx;#n9)gslS{Ame?Yci|7J^aRL=Bt^apf;CV%^%UVdf+N8tZ(3lTJR`;5?AA(
z7{zHGXP4Y*9yBAVJJec8iykV84?UFlgE*O3_MuvdIa^Md$pOk+Gc-rD6;yE*oFjGI
zXjbStZZ&4yM<ee?i!v2UD>T7vV(EI42rH<YaAU~Ay{NP@zq-2K?yR+2t!01A2!325
zs3Dal=%#i=<hlc!?)Ncz=bzEgPHYFQyu|V?A*rWT>c`fdT*?Yh_qrqgh-Mo2m=+6>
z52B-@;gG<h!>;2Qq`g>9MydDNOk;?8<<hll*Jx895=0(%W2vqdaWWoR@2W=hdwiH!
z14*6x-qOEtw@z0n9d)=>X)zFP2vC(rFANdP<Ei7Q&|Ww+&R_?Q^aMgiWut>bzB!_l
zZ^4zFM@zoE3@4xg;zYQBbNxLm+^YMd*XFK^(6s|G8M`0LZTYyCO4hcvyociCoO{$s
zS^Ke+u0M$X$*0PF-aBczl1t65vu%YqomO%0gpUE@NykM|m9|CKD&0wh?Arn6>|*k;
z0ujfpcq)&$TW(zS(I=5Y_F91>?6bsJV}@#X<Z|)HUi6XZ<_$M;sft7e9muEF>+GF8
z@k`TwT&}R=C;`##tce_07M~H_p^cGwOvbx{ka}GD&9i01El;`G>OicxH$wfz)%7c%
z?5xhWmt8vAEq8g~K<NR~v^Z|8?b^D8DKqzqJ)Q;s@f~sIT>v9~(93!Em!F|&rsD6r
z@;sX%*P+`q>+bAz{8*Ye5>q`Hv4XXM9y-6>_51y~)uo>*Nv``<!bjyDdB{`;$<q(b
z_=cSS=Z$p-T+_^qYm-{X&xhF1C2I6F6}ijoW>GAiP9f7KL&#nlX7CPCAC&go!+bbC
z&pUB{JRu)mI9}jQ+Vj8bxCPqe{oGym-4m(Oqp2<UoIY8*tI~Axj?X487|fLWxhJ}U
zJx>c8Q;#ycT6bhtb4K!&1+|6nN#gS2{KeHp>ba!~@{Y?JR%NwBY>R{ne~Yjjb)G+}
z`vsn>v$VEBOF|0G6Ivf`CUKWmu(p>*)DczG720x`(^ytplN-h=$*7*);ab#*uDGN&
z{@<;jA`FU-H<6Gc8p=HR(Z$8u{6h0=?a8zAE9cKOn@zutqBfUN%n_+NQ<}TU2q^-+
z+Nihdw4}C$mMIpDZn8!%T-rz7Y~*`G<<rgPnaUI!=+%a6I2S?a)Z`gZTFtxGKl)e@
zr;f^e8&>eNMOc-&J3^bM^CRkYrTebkfwh@wH5xlRJM~*7jq14lHQ#-xxWchFfzRkz
z_zo9M)li`OqGqrx7v!e3xZ7A&2`1qLQNN0{X^f^vw9YXW2?nhlrznD<JC_X^Rv}2r
zvR}iBE9Guy(8C%3u>TMHf4KM&*YVt1g#CZm|A+nmZrC2rAPxKfu>T*wEe`wt|Kcl<
nu>V)?)m_;C{~KFS{~9gq|0mvE`1b-~{~z}M{vF2u?*9LuF+hoN

literal 0
HcmV?d00001

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp
index 0eb6e1767c..bce0b37763 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@@ -31,8 +31,8 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
   ComputeSNAGrid(lmp, narg, arg)
 {
 
@@ -43,22 +43,83 @@ ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, ch
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
 
+  host_flag = (execution_space == Host);
+
 }
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
 {
   if (copymode) return;
 
 
 }
 
-namespace LAMMPS_NS {
-template class ComputeSNAGridKokkos<LMPDeviceType>;
-#ifdef LMP_KOKKOS_GPU
-template class ComputeSNAGridKokkos<LMPHostType>;
-#endif
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
+{
+
+  printf("^^^ beginning of ComputeSNAGridKokkos init()\n");
+
+  // init non-kk compute
+  // this calls snaptr->init(), we probably want to init the kokkos snaptr?
+  // let's copy pair_snap_kokkos by making a snaKK in header
+  ComputeSNAGrid::init();
+
+  // adjust neighbor list request for KOKKOS
+
+  // taken from compute_coord_atom_kokkos
+  // this segfaults
+  /*
+  printf("^^^ before neigh request\n");
+  auto request = neighbor->find_request(this);
+  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+                           !std::is_same<DeviceType,LMPDeviceType>::value);
+  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+  */
+
+
+  // taken from pair_snap_kokkos init
+  // compile errors with:
+  // error: pointer to incomplete class type "LAMMPS_NS::KokkosLMP" is not allowed"
+  /*
+  if (host_flag) {
+    if (lmp->kokkos->nthreads > 1)
+      error->all(FLERR,"compute sna grid can currently only run on a single "
+                         "CPU thread");
+
+    // this calls snaptr->init()
+    // we probably wanna call init of kokkos snaptr
+    ComputeSNAGrid::init();
+    return;
+  }
+
+  if (force->newton_pair == 0)
+    error->all(FLERR,"Pair style SNAP requires newton pair on");
+
+  // neighbor list request for KOKKOS
+
+  neighflag = lmp->kokkos->neighflag;
+
+  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
+  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+                           !std::is_same<DeviceType,LMPDeviceType>::value);
+  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+  if (neighflag == FULL)
+    error->all(FLERR,"Must use half neighbor list style with pair snap/kk");
+  */
+
+  // Overall, I think maybe this compute does not need a neighlist request because the original
+  // compute_sna_grid.cpp does not have one.
 }
 
+namespace LAMMPS_NS {
+template class ComputeSNAGridKokkos<LMPDeviceType, real_type, vector_length>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridKokkos<LMPHostType, real_type, vector_length>;
+#endif
+}
diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index ad365fca43..4261d207f7 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -14,8 +14,8 @@
 #ifdef COMPUTE_CLASS
 // clang-format off
 ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
-ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
-ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
+//ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
+//ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
 // clang-format on
 #else
 
@@ -25,27 +25,42 @@ ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
 
 #include "compute_sna_grid.h"
 #include "kokkos_type.h"
+#include "sna_kokkos.h"
 
 namespace LAMMPS_NS {
 
 //template<int CSTYLE, int NCOL>
 //struct TagComputeCoordAtom{};
 
-template<class DeviceType>
+// copying pair_snap_kokkos, template args are real_type and vector_length
+template<class DeviceType, typename real_type_, int vector_length_>
 class ComputeSNAGridKokkos : public ComputeSNAGrid {
  public:
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
 
+  static constexpr int vector_length = vector_length_;
+  using real_type = real_type_;
+
   ComputeSNAGridKokkos(class LAMMPS *, int, char **);
   ~ComputeSNAGridKokkos() override;
-  //void init() override;
+  void init() override;
   //void compute_peratom() override;
   //enum {NONE,CUTOFF,ORIENT};
 
   //template<int CSTYLE, int NCOL>
   //KOKKOS_INLINE_FUNCTION
   //void operator()(TagComputeCoordAtom<CSTYLE,NCOL>, const int&) const;
+  
+ protected:
+
+  // these are used by pair_snap_kokkos
+  // neighflag gets set in init()
+  // what about host_flag?
+  // dunno... commented these out for now
+  int host_flag, neighflag;
+
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
 
  private:
 

From a0a7f14db5d284f41c4fe738a2d4d8d0a87b8dc5 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sun, 12 Mar 2023 17:01:20 -0600
Subject: [PATCH 04/51] Remove swo

---
 src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo | Bin 16384 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo

diff --git a/src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo b/src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo
deleted file mode 100644
index 1f6b26487397da6524dd5e8647ab7bba3ab61fb1..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16384
zcmeI2U5jK#8OJLzCT@~VG*LmXp7cV}J@nh`3vSx%m|=EyBJ+~T_Ksj&wyEh;-F-6W
zR2{3TXKa?l51=3#F@hJ0f)@%w!8<R+2m$>7UMMK)M!oRDAn`(U<G<>ho}HfQnMJeW
zD0Sf1J#(u1sXG69o_eYdwb}f{$5-imy~%LBm$A=(<e}*4*S}3`FFeY4q}9M|xsoF<
zGuu3O{>d|Ej;w^jmkNW8gr{j{8b)^;%g-(^Kih6}RhkX0=ora6eH}-gEw#0!Oud_B
zx3e0`g91T;+bS@$z1sPc8)r_>`cOYrdW;@@WaG9!3l)L_L4lw^P#`D}6bK3g1%d)W
zf&U)`Y<4&MEFR}w#Ush$TNU4T72kh<JHH-2r2HR6d3s3s&7vHy82q*0esMc|1qFfv
zL4lw^P#`D}6bK3g1%d)WfuKN8ASm!3P=H5_orB#IcjE(_^YQ-wPuS>x6I=zSz?%;-
z_Gj>2z`=*WUr#dj7Wg5MAO+8WB~S++1owb9A7t$3;5G0a@CBg3^WYL#1T&xl%HWp|
zF!l@ZRZs$Vfw%8x>?ZgL_&)d|SO=%UBjA1Dk0%)W19$`60Iz_rfGdE3o9|)l_uvh1
z1N;{J27DR344wlgz<uCe@U#0E`ziPyh`<xzBzO?~`d-F<1%3d&0lo$dxCs9CZpK~(
z99#vT0?&i9;KShcdl>sMcolpOTmT;c?+1Uui<lpQZ-H09%iv4EfYV?WJOJ(oH!x<u
z178Olz>VQs;}<qZ`3n7O*JH*;tyXM0hNt4Gi()(OiYuc`R99BlZ~ZsdYHjmmC@LRi
zL|S}$D2%Pm8DWdF9G!tOwtR%CXHf3Ujt$Y{kHZolS~WID7f6b@KiE{dHpio=Q#4by
zgV<DQ&6O=EX>6-?dGWc<rHz#p^ae)!uX+jZ!$Oi~wYmIwNo{Voz*lQDcI{IV8@t<X
zPie|esyJ&zLTlfJBq?87S-gC?T3Y9^CSym5p!VAQy8mUYg@M5xt0ZNvT*;DGC+<-h
zCcDCQz!i6V@r4W2thfeV^o?j?hgv!{i5BwnF)+P^(=iwf8QZeAMTB1O2+FkD<eSNe
zcDR(Bx;#n9)gslS{Ame?Yci|7J^aRL=Bt^apf;CV%^%UVdf+N8tZ(3lTJR`;5?AA(
z7{zHGXP4Y*9yBAVJJec8iykV84?UFlgE*O3_MuvdIa^Md$pOk+Gc-rD6;yE*oFjGI
zXjbStZZ&4yM<ee?i!v2UD>T7vV(EI42rH<YaAU~Ay{NP@zq-2K?yR+2t!01A2!325
zs3Dal=%#i=<hlc!?)Ncz=bzEgPHYFQyu|V?A*rWT>c`fdT*?Yh_qrqgh-Mo2m=+6>
z52B-@;gG<h!>;2Qq`g>9MydDNOk;?8<<hll*Jx895=0(%W2vqdaWWoR@2W=hdwiH!
z14*6x-qOEtw@z0n9d)=>X)zFP2vC(rFANdP<Ei7Q&|Ww+&R_?Q^aMgiWut>bzB!_l
zZ^4zFM@zoE3@4xg;zYQBbNxLm+^YMd*XFK^(6s|G8M`0LZTYyCO4hcvyociCoO{$s
zS^Ke+u0M$X$*0PF-aBczl1t65vu%YqomO%0gpUE@NykM|m9|CKD&0wh?Arn6>|*k;
z0ujfpcq)&$TW(zS(I=5Y_F91>?6bsJV}@#X<Z|)HUi6XZ<_$M;sft7e9muEF>+GF8
z@k`TwT&}R=C;`##tce_07M~H_p^cGwOvbx{ka}GD&9i01El;`G>OicxH$wfz)%7c%
z?5xhWmt8vAEq8g~K<NR~v^Z|8?b^D8DKqzqJ)Q;s@f~sIT>v9~(93!Em!F|&rsD6r
z@;sX%*P+`q>+bAz{8*Ye5>q`Hv4XXM9y-6>_51y~)uo>*Nv``<!bjyDdB{`;$<q(b
z_=cSS=Z$p-T+_^qYm-{X&xhF1C2I6F6}ijoW>GAiP9f7KL&#nlX7CPCAC&go!+bbC
z&pUB{JRu)mI9}jQ+Vj8bxCPqe{oGym-4m(Oqp2<UoIY8*tI~Axj?X487|fLWxhJ}U
zJx>c8Q;#ycT6bhtb4K!&1+|6nN#gS2{KeHp>ba!~@{Y?JR%NwBY>R{ne~Yjjb)G+}
z`vsn>v$VEBOF|0G6Ivf`CUKWmu(p>*)DczG720x`(^ytplN-h=$*7*);ab#*uDGN&
z{@<;jA`FU-H<6Gc8p=HR(Z$8u{6h0=?a8zAE9cKOn@zutqBfUN%n_+NQ<}TU2q^-+
z+Nihdw4}C$mMIpDZn8!%T-rz7Y~*`G<<rgPnaUI!=+%a6I2S?a)Z`gZTFtxGKl)e@
zr;f^e8&>eNMOc-&J3^bM^CRkYrTebkfwh@wH5xlRJM~*7jq14lHQ#-xxWchFfzRkz
z_zo9M)li`OqGqrx7v!e3xZ7A&2`1qLQNN0{X^f^vw9YXW2?nhlrznD<JC_X^Rv}2r
zvR}iBE9Guy(8C%3u>TMHf4KM&*YVt1g#CZm|A+nmZrC2rAPxKfu>T*wEe`wt|Kcl<
nu>V)?)m_;C{~KFS{~9gq|0mvE`1b-~{~z}M{vF2u?*9LuF+hoN


From 584a6200f5db3032d57d7c66bb0a8168a2e3ea1e Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sun, 12 Mar 2023 20:02:57 -0600
Subject: [PATCH 05/51] Mimic pair snap kokkos pattern

---
 src/KOKKOS/.compute_sna_grid_kokkos.h.swo | Bin 0 -> 12288 bytes
 src/KOKKOS/compute_sna_grid_kokkos.cpp    | 110 +------------------
 src/KOKKOS/compute_sna_grid_kokkos.h      |  98 +++++++++--------
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 127 ++++++++++++++++++++++
 4 files changed, 187 insertions(+), 148 deletions(-)
 create mode 100644 src/KOKKOS/.compute_sna_grid_kokkos.h.swo
 create mode 100644 src/KOKKOS/compute_sna_grid_kokkos_impl.h

diff --git a/src/KOKKOS/.compute_sna_grid_kokkos.h.swo b/src/KOKKOS/.compute_sna_grid_kokkos.h.swo
new file mode 100644
index 0000000000000000000000000000000000000000..12ede098aa5071704880ee2741eba65408343158
GIT binary patch
literal 12288
zcmeI2&yO6%6~_xOKWt*KumlnUsqjj$Uhhnge+4tFR~cr<yVlPB5Ho9WqEp*lGt-&=
zrK*})ulQjRDaR<e<&a1@1aaX2B)}z#5>VpY!au-)LlltU1_DP)1mCLeo*%oj>m{-f
zin^uG%y!qSSFhfydauUr&C-?2EA&FKL~tD=q#WPpu3mqQuKwgWVXlZeva4J6z2xcA
z`3p~-J-e?E8edKoT0RScSXxpz*7Dl&@>;`kq9E=mZp)C_ZQ;3gH|lmHS?t_QXWpKB
zAosulJkV3E!iA&c?3vSMt52PHj2?er;{Y9Vk=z5h2XYVO9>_hAdm#5f?t$C`xd;AV
zJfPy+$Tdvn_H;toul?K7`|SJc^!jo7{GlD~>23a%dm#5f?t$C`xd(C&<Q~X9kb5Ba
zK<<Iu1Gxus4}1?jz+6H;#E<+}?|^am|Nrdw|GW1S5`!nfo!}7o^L>Q$Kmi;9hrtI&
z3Hd2_1^D1Oa2)*o2ZX!@x`2a6!Gquq@ZWn0`49L6ybpc@ehpp%E~tW2;25|c{QC$Y
zH^AHA8t8*6I0X)ae;y{}Z{V-s=O6&jfC{)5e0&cf{{sI2zXR96iy#J1f**qO;KtpA
zd<DJ$pM&3mMeqc892^0^yo-=ufS-XUz~kUi@Cf+mPD0)UZ-8F`2?U5h2%ZCT;QArh
z3;qaf@I2T6P4F=I46*nWd<fnHe*u32?}B&0AHeUy>p;itOK=qbfO7;K26uzIz}JZD
z4e%BC5{zC`JaoMlNx#TT@cz?;(isaMYCGm33plMb%VlM^{D>+0j7wcRj6~3Xn(Qm%
zFd^8sR<>aJyFKO)b#(1v`^0fh>v5bucP45;D>Rta-9~t!vQ2lbxZ`OBJBApO@S{mH
zNknYHLN^KoXJ*>HAmm=Vv!}*!anJXpnjmX*Y(_$+&eSUP?G7{GzNztRSFF;oA!L}N
zWw}vr)8{ncK_s^Ao@8x4eNr3Hk35%pp{Mk%5v?XK+B8-xi+Igu*}P@56iGGh_)JP#
zudJ-BH|UJ%eL&e+jG$(z(_w<n=$?eeu*Y-CddQyi!Zv+|Nlq`)NgC%Cc;9n)6ET~m
zuxs5;-U+gwtySvdPnOmiP5Vl%TCc6nV|uah`b=>QlNdiqyPg{@rv}~1@#zah@gVkL
z0G&(5A5LVROnEJrw`hH3FrCG9ylt^>#Awv#0#V^RLO1e!&lo0SA2f;+rN6s4F4V4E
zuGYR~Bomu&SuDqGxJfjLWzncb`Bfzt9fUn*0$EaVA$7ZobUU^xRUJ;9WS>!~wy`r?
z%u(2yT1NKbI1(ybiqtF}F6oxVLdwLZhan&t_JU0=wBBZiWAAJU7ROu+#Q9a^7_Bd0
zUB1tV9dz5am`Ma3*2iAV7Gbh6mzRgD78`VsFSF+(yK}x=?lZrK)v5P2c50G6MkHn_
z5~go!C9@Ps8gsUN9=26yp5o2E<z=&JBZU_3mZ8Kf=ZV>CWX<agXY5aXYMTNxZITW?
zPL7F=WW15*80Y0l*<pIT^F)#H#5iG-z`Vql+CHCOg&W}<tqV7E1CQv8m1e`iGl&lU
zwspHyOnDAgPbd|?8Vj9T$32NIW1dAf^wTca`KETozEoeUG^vS{iG)_0HCJ?LZbDX$
zL~R~*epF~=LAWM057bx|$GW61RM$4@7^kv$v1CpS+4EY+P>0-AUDz)9tSv8I+NjrG
zT&mPBnUCfYa{7Yqk`;)(6Zc*lePpNZO1rbe^*eN{iP#RG&Er6Ztt(^Z@U-UHs|~2e
zhMV$2$M3nEo`{(z1}~zXKuXUV^joUg3#uo^x|kAg>-s+;FzlFgGX5-fMm6fCGy49k
zZY5KQs29RO6p`xM%KApLW@Ba9i_czO$cp5Wt~6m*YW9{wB7v{NcA<qM98<_<EeybY
z$zlJFrZ77eX8S?suxU)%(y!)P-LBO^Rh`XRM~@Q8+L?pJy720iHje{AGEOFcaBW)0
zE?pA~>z=xk6?_w0hbvxNyO*wSrDT^=;qUZJkOiq)uQVF?HFc}5<nT@#JgEnItF6>%
z%R~L~LQ^7C6c>En=YDq1b3I|Q^mb7v5hJs>x<MCt$OZH1x;e(tI*w<d<V7vwaG@B~
z32bpC6UCmSGHR(UCU}veUX9#S{Ow{l2yYY1+oe4$RiX7P)C!eq>0IHdbCvpq^QBUW
zZh5Lh8#s1*48>8PNFo(RA@(%N=|-_pq>K7|6b2kAQE+XA)S9YMylt$Z(`QPhvnMC$
zfK?lM;KT`B1X5S?Y80Z@I?7BvJ@{g|a6gL87>cu?Ow%VQ@;^#mTQ5eUt;g*s70a?^
zZEbB8hb<OPtjQc!jrs6mo#BwrPzyb73{HnCph2{GK{^Z(s;DUN9BU==xzqD;l3u}a
k+=DUObREYG-$&3IUVtVjBGwEo%oM0|>XiBZ-&YX%2H#=@?f?J)

literal 0
HcmV?d00001

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp
index bce0b37763..197234cf1d 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@@ -13,113 +13,13 @@
 ------------------------------------------------------------------------- */
 
 #include "compute_sna_grid_kokkos.h"
-
-#include "atom_kokkos.h"
-#include "atom_masks.h"
-#include "comm.h"
-#include "error.h"
-#include "memory_kokkos.h"
-#include "modify.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "neighbor_kokkos.h"
-#include "sna_kokkos.h"
-#include "update.h"
-
-using namespace LAMMPS_NS;
-
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType, typename real_type, int vector_length>
-ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
-  ComputeSNAGrid(lmp, narg, arg)
-{
-
-  printf("^^^ inside ComputeSNAGridKokkos constructor\n");
-  kokkosable = 1;
-  atomKK = (AtomKokkos *) atom;
-  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = EMPTY_MASK;
-  datamask_modify = EMPTY_MASK;
-
-  host_flag = (execution_space == Host);
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType, typename real_type, int vector_length>
-ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
-{
-  if (copymode) return;
-
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType, typename real_type, int vector_length>
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
-{
-
-  printf("^^^ beginning of ComputeSNAGridKokkos init()\n");
-
-  // init non-kk compute
-  // this calls snaptr->init(), we probably want to init the kokkos snaptr?
-  // let's copy pair_snap_kokkos by making a snaKK in header
-  ComputeSNAGrid::init();
-
-  // adjust neighbor list request for KOKKOS
-
-  // taken from compute_coord_atom_kokkos
-  // this segfaults
-  /*
-  printf("^^^ before neigh request\n");
-  auto request = neighbor->find_request(this);
-  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-                           !std::is_same<DeviceType,LMPDeviceType>::value);
-  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-  */
-
-
-  // taken from pair_snap_kokkos init
-  // compile errors with:
-  // error: pointer to incomplete class type "LAMMPS_NS::KokkosLMP" is not allowed"
-  /*
-  if (host_flag) {
-    if (lmp->kokkos->nthreads > 1)
-      error->all(FLERR,"compute sna grid can currently only run on a single "
-                         "CPU thread");
-
-    // this calls snaptr->init()
-    // we probably wanna call init of kokkos snaptr
-    ComputeSNAGrid::init();
-    return;
-  }
-
-  if (force->newton_pair == 0)
-    error->all(FLERR,"Pair style SNAP requires newton pair on");
-
-  // neighbor list request for KOKKOS
-
-  neighflag = lmp->kokkos->neighflag;
-
-  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
-  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-                           !std::is_same<DeviceType,LMPDeviceType>::value);
-  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-  if (neighflag == FULL)
-    error->all(FLERR,"Must use half neighbor list style with pair snap/kk");
-  */
-
-  // Overall, I think maybe this compute does not need a neighlist request because the original
-  // compute_sna_grid.cpp does not have one.
-}
+#include "compute_sna_grid_kokkos_impl.h"
 
 namespace LAMMPS_NS {
-template class ComputeSNAGridKokkos<LMPDeviceType, real_type, vector_length>;
+
+template class ComputeSNAGridKokkosDevice<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
-template class ComputeSNAGridKokkos<LMPHostType, real_type, vector_length>;
+template class ComputeSNAGridKokkosHost<LMPHostType>;
 #endif
+
 }
diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index 4261d207f7..9ab23f5bd2 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -13,9 +13,13 @@
 
 #ifdef COMPUTE_CLASS
 // clang-format off
-ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
-//ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
-//ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
+ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+#ifdef LMP_KOKKOS_GPU
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost<LMPHostType>);
+#else
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
+#endif
 // clang-format on
 #else
 
@@ -25,71 +29,79 @@ ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
 
 #include "compute_sna_grid.h"
 #include "kokkos_type.h"
+//#include "neigh_list_kokkos.h"
 #include "sna_kokkos.h"
+//#include "pair_kokkos.h"
 
 namespace LAMMPS_NS {
 
-//template<int CSTYLE, int NCOL>
-//struct TagComputeCoordAtom{};
-
-// copying pair_snap_kokkos, template args are real_type and vector_length
 template<class DeviceType, typename real_type_, int vector_length_>
 class ComputeSNAGridKokkos : public ComputeSNAGrid {
  public:
+  //enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD};
+  //enum {COUL_FLAG=0};
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;
 
   static constexpr int vector_length = vector_length_;
   using real_type = real_type_;
+  //using complex = SNAComplex<real_type>;
 
   ComputeSNAGridKokkos(class LAMMPS *, int, char **);
   ~ComputeSNAGridKokkos() override;
-  void init() override;
-  //void compute_peratom() override;
-  //enum {NONE,CUTOFF,ORIENT};
 
-  //template<int CSTYLE, int NCOL>
-  //KOKKOS_INLINE_FUNCTION
-  //void operator()(TagComputeCoordAtom<CSTYLE,NCOL>, const int&) const;
+  void init() override;
+  //void compute_array(int, int) override;
+  //double memory_usage() override;
   
  protected:
-
-  // these are used by pair_snap_kokkos
-  // neighflag gets set in init()
-  // what about host_flag?
-  // dunno... commented these out for now
-  int host_flag, neighflag;
-
   SNAKokkos<DeviceType, real_type, vector_length> snaKK;
 
- private:
 
+  using KKDeviceType = typename KKDevice<DeviceType>::value;
 
-  /*
-  int inum;
-
-  typename AT::t_x_array_randomread x;
-  typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
-  typename ArrayTypes<DeviceType>::t_int_1d mask;
-
-  typename AT::t_neighbors_2d d_neighbors;
-  typename AT::t_int_1d_randomread d_ilist;
-  typename AT::t_int_1d_randomread d_numneigh;
-
-  typename AT::t_int_1d d_typelo;
-  typename AT::t_int_1d d_typehi;
-
-  DAT::tdual_float_1d k_cvec;
-  typename AT::t_float_1d d_cvec;
-  DAT::tdual_float_2d k_carray;
-  typename AT::t_float_2d d_carray;
-
-  typename AT::t_float_2d d_normv;
-  */
 };
 
+// These wrapper classes exist to make the compute style factory happy/avoid having
+// to extend the compute style factory to support Compute classes w/an arbitrary number
+// of extra template parameters
+
+template <class DeviceType>
+class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
+  //ComputeSNAGridKokkosDevice(class LAMMPS *);
+
+  void init() override;
+  //double memory_usage() override;
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+template <class DeviceType>
+class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
+  //ComputeSNAGridKokkosHost(class LAMMPS *);
+
+  void init();
+  //double memory_usage();
+
+};
+#endif
+
 }
 
 #endif
 #endif
-
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
new file mode 100644
index 0000000000..e958fcdb45
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -0,0 +1,127 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
+                         Evan Weinberg (NVIDIA)
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
+#include "neighbor_kokkos.h"
+#include "neigh_request.h"
+#include "sna.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+namespace LAMMPS_NS {
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid (lmp, narg, arg)
+{
+  //respa_enable = 0;
+
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  //datamask_read = EMPTY_MASK;
+  //datamask_modify = EMPTY_MASK;
+
+  //host_flag = (execution_space == Host);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
+{
+  if (copymode) return;
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
+{
+
+  printf("^^^ inside ComputeSNAGridKokkos init\n");
+  // from pair_snap_kokkos_impl.h :
+  /*
+  if (host_flag) {
+    if (lmp->kokkos->nthreads > 1)
+      error->all(FLERR,"Pair style snap/kk can currently only run on a single "
+                         "CPU thread");
+
+    PairSNAP::init_style();
+    return;
+  }
+
+  if (force->newton_pair == 0)
+    error->all(FLERR,"Pair style SNAP requires newton pair on");
+
+  // neighbor list request for KOKKOS
+
+  neighflag = lmp->kokkos->neighflag;
+
+  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
+  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+                           !std::is_same<DeviceType,LMPDeviceType>::value);
+  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+  if (neighflag == FULL)
+    error->all(FLERR,"Must use half neighbor list style with pair snap/kk");
+  */
+}
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridKokkosDevice<DeviceType>::init()
+{
+  Base::init();
+}
+
+#ifdef LMP_KOKKOS_GPU
+template<class DeviceType>
+ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridKokkosHost<DeviceType>::init()
+{
+  Base::init();
+}
+#endif
+
+}

From de4dbec66100284c04f712f6788ff61699009222 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sun, 12 Mar 2023 20:03:35 -0600
Subject: [PATCH 06/51] Remove swo

---
 src/KOKKOS/.compute_sna_grid_kokkos.h.swo | Bin 12288 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 src/KOKKOS/.compute_sna_grid_kokkos.h.swo

diff --git a/src/KOKKOS/.compute_sna_grid_kokkos.h.swo b/src/KOKKOS/.compute_sna_grid_kokkos.h.swo
deleted file mode 100644
index 12ede098aa5071704880ee2741eba65408343158..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12288
zcmeI2&yO6%6~_xOKWt*KumlnUsqjj$Uhhnge+4tFR~cr<yVlPB5Ho9WqEp*lGt-&=
zrK*})ulQjRDaR<e<&a1@1aaX2B)}z#5>VpY!au-)LlltU1_DP)1mCLeo*%oj>m{-f
zin^uG%y!qSSFhfydauUr&C-?2EA&FKL~tD=q#WPpu3mqQuKwgWVXlZeva4J6z2xcA
z`3p~-J-e?E8edKoT0RScSXxpz*7Dl&@>;`kq9E=mZp)C_ZQ;3gH|lmHS?t_QXWpKB
zAosulJkV3E!iA&c?3vSMt52PHj2?er;{Y9Vk=z5h2XYVO9>_hAdm#5f?t$C`xd;AV
zJfPy+$Tdvn_H;toul?K7`|SJc^!jo7{GlD~>23a%dm#5f?t$C`xd(C&<Q~X9kb5Ba
zK<<Iu1Gxus4}1?jz+6H;#E<+}?|^am|Nrdw|GW1S5`!nfo!}7o^L>Q$Kmi;9hrtI&
z3Hd2_1^D1Oa2)*o2ZX!@x`2a6!Gquq@ZWn0`49L6ybpc@ehpp%E~tW2;25|c{QC$Y
zH^AHA8t8*6I0X)ae;y{}Z{V-s=O6&jfC{)5e0&cf{{sI2zXR96iy#J1f**qO;KtpA
zd<DJ$pM&3mMeqc892^0^yo-=ufS-XUz~kUi@Cf+mPD0)UZ-8F`2?U5h2%ZCT;QArh
z3;qaf@I2T6P4F=I46*nWd<fnHe*u32?}B&0AHeUy>p;itOK=qbfO7;K26uzIz}JZD
z4e%BC5{zC`JaoMlNx#TT@cz?;(isaMYCGm33plMb%VlM^{D>+0j7wcRj6~3Xn(Qm%
zFd^8sR<>aJyFKO)b#(1v`^0fh>v5bucP45;D>Rta-9~t!vQ2lbxZ`OBJBApO@S{mH
zNknYHLN^KoXJ*>HAmm=Vv!}*!anJXpnjmX*Y(_$+&eSUP?G7{GzNztRSFF;oA!L}N
zWw}vr)8{ncK_s^Ao@8x4eNr3Hk35%pp{Mk%5v?XK+B8-xi+Igu*}P@56iGGh_)JP#
zudJ-BH|UJ%eL&e+jG$(z(_w<n=$?eeu*Y-CddQyi!Zv+|Nlq`)NgC%Cc;9n)6ET~m
zuxs5;-U+gwtySvdPnOmiP5Vl%TCc6nV|uah`b=>QlNdiqyPg{@rv}~1@#zah@gVkL
z0G&(5A5LVROnEJrw`hH3FrCG9ylt^>#Awv#0#V^RLO1e!&lo0SA2f;+rN6s4F4V4E
zuGYR~Bomu&SuDqGxJfjLWzncb`Bfzt9fUn*0$EaVA$7ZobUU^xRUJ;9WS>!~wy`r?
z%u(2yT1NKbI1(ybiqtF}F6oxVLdwLZhan&t_JU0=wBBZiWAAJU7ROu+#Q9a^7_Bd0
zUB1tV9dz5am`Ma3*2iAV7Gbh6mzRgD78`VsFSF+(yK}x=?lZrK)v5P2c50G6MkHn_
z5~go!C9@Ps8gsUN9=26yp5o2E<z=&JBZU_3mZ8Kf=ZV>CWX<agXY5aXYMTNxZITW?
zPL7F=WW15*80Y0l*<pIT^F)#H#5iG-z`Vql+CHCOg&W}<tqV7E1CQv8m1e`iGl&lU
zwspHyOnDAgPbd|?8Vj9T$32NIW1dAf^wTca`KETozEoeUG^vS{iG)_0HCJ?LZbDX$
zL~R~*epF~=LAWM057bx|$GW61RM$4@7^kv$v1CpS+4EY+P>0-AUDz)9tSv8I+NjrG
zT&mPBnUCfYa{7Yqk`;)(6Zc*lePpNZO1rbe^*eN{iP#RG&Er6Ztt(^Z@U-UHs|~2e
zhMV$2$M3nEo`{(z1}~zXKuXUV^joUg3#uo^x|kAg>-s+;FzlFgGX5-fMm6fCGy49k
zZY5KQs29RO6p`xM%KApLW@Ba9i_czO$cp5Wt~6m*YW9{wB7v{NcA<qM98<_<EeybY
z$zlJFrZ77eX8S?suxU)%(y!)P-LBO^Rh`XRM~@Q8+L?pJy720iHje{AGEOFcaBW)0
zE?pA~>z=xk6?_w0hbvxNyO*wSrDT^=;qUZJkOiq)uQVF?HFc}5<nT@#JgEnItF6>%
z%R~L~LQ^7C6c>En=YDq1b3I|Q^mb7v5hJs>x<MCt$OZH1x;e(tI*w<d<V7vwaG@B~
z32bpC6UCmSGHR(UCU}veUX9#S{Ow{l2yYY1+oe4$RiX7P)C!eq>0IHdbCvpq^QBUW
zZh5Lh8#s1*48>8PNFo(RA@(%N=|-_pq>K7|6b2kAQE+XA)S9YMylt$Z(`QPhvnMC$
zfK?lM;KT`B1X5S?Y80Z@I?7BvJ@{g|a6gL87>cu?Ow%VQ@;^#mTQ5eUt;g*s70a?^
zZEbB8hb<OPtjQc!jrs6mo#BwrPzyb73{HnCph2{GK{^Z(s;DUN9BU==xzqD;l3u}a
k+=DUObREYG-$&3IUVtVjBGwEo%oM0|>XiBZ-&YX%2H#=@?f?J)


From 212b86405251bbdcc593a27cb0aec4ebc19b0374 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sat, 8 Apr 2023 11:19:07 -0600
Subject: [PATCH 07/51] Add all SNAP computations and 4D view Kokkos memory
 allocator

---
 src/KOKKOS/compute_sna_grid_kokkos.cpp    |  56 ++
 src/KOKKOS/compute_sna_grid_kokkos.h      | 259 +++++++-
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 765 +++++++++++++++++++++-
 src/KOKKOS/memory_kokkos.h                |  50 ++
 src/ML-SNAP/compute_grid.cpp              |   4 +-
 src/ML-SNAP/compute_sna_grid.cpp          |  29 +-
 src/ML-SNAP/compute_sna_grid.h            |  12 +-
 7 files changed, 1118 insertions(+), 57 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp
index 197234cf1d..8a05ba7901 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@@ -23,3 +23,59 @@ template class ComputeSNAGridKokkosHost<LMPHostType>;
 #endif
 
 }
+
+
+
+
+// The following chunk will compile but we're gonna try a wrapper approach like pair snap.
+/*
+#include "compute_sna_grid_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "sna_kokkos.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+
+// ----------------------------------------------------------------------
+
+template<class DeviceType>
+ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeSNAGrid(lmp, narg, arg)
+{
+
+  printf("^^^ inside ComputeSNAGridKokkos constructor\n");
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+}
+
+// ----------------------------------------------------------------------
+
+template<class DeviceType>
+ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
+{
+  if (copymode) return;
+
+
+}
+
+namespace LAMMPS_NS {
+template class ComputeSNAGridKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridKokkos<LMPHostType>;
+#endif
+}
+*/
+
diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index 9ab23f5bd2..b461f755b8 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -29,42 +29,233 @@ ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
 
 #include "compute_sna_grid.h"
 #include "kokkos_type.h"
+//#include "pair_snap.h"
+//#include "kokkos_type.h"
 //#include "neigh_list_kokkos.h"
 #include "sna_kokkos.h"
 //#include "pair_kokkos.h"
 
 namespace LAMMPS_NS {
 
+// Routines for both the CPU and GPU backend
+//template<int NEIGHFLAG, int EVFLAG>
+//struct TagPairSNAPComputeForce{};
+
+
+// GPU backend only
+/*
+struct TagPairSNAPComputeNeigh{};
+struct TagPairSNAPComputeCayleyKlein{};
+struct TagPairSNAPPreUi{};
+struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
+struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
+struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+struct TagPairSNAPComputeZi{};
+struct TagPairSNAPBeta{};
+struct TagPairSNAPComputeBi{};
+struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
+struct TagPairSNAPComputeYi{};
+struct TagPairSNAPComputeYiWithZlist{};
+template<int dir>
+struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
+template<int dir>
+struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
+*/
+//struct TagPairSNAPPreUi{};
+struct TagCSNAGridComputeNeigh{};
+struct TagCSNAGridComputeCayleyKlein{};
+struct TagCSNAGridPreUi{};
+struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
+struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
+struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+struct TagCSNAGridComputeZi{};
+struct TagCSNAGridComputeBi{};
+
+struct TagComputeSNAGridLoop{};
+struct TagComputeSNAGrid3D{};
+//struct TagCSNAGridTeam{};
+
+// CPU backend only
+/*
+struct TagPairSNAPComputeNeighCPU{};
+struct TagPairSNAPPreUiCPU{};
+struct TagPairSNAPComputeUiCPU{};
+struct TagPairSNAPTransformUiCPU{};
+struct TagPairSNAPComputeZiCPU{};
+struct TagPairSNAPBetaCPU{};
+struct TagPairSNAPComputeBiCPU{};
+struct TagPairSNAPZeroYiCPU{};
+struct TagPairSNAPComputeYiCPU{};
+struct TagPairSNAPComputeDuidrjCPU{};
+struct TagPairSNAPComputeDeidrjCPU{};
+*/
+struct TagComputeSNAGridLoopCPU{};
+
+//template<class DeviceType>
 template<class DeviceType, typename real_type_, int vector_length_>
 class ComputeSNAGridKokkos : public ComputeSNAGrid {
  public:
-  //enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD};
-  //enum {COUL_FLAG=0};
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
-  typedef EV_FLOAT value_type;
 
   static constexpr int vector_length = vector_length_;
   using real_type = real_type_;
-  //using complex = SNAComplex<real_type>;
+  using complex = SNAComplex<real_type>;
+
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_transform_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_transform_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+#endif
+
+  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <class Device, int num_tiles, class TagComputeSNAP>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagComputeSNAP>;
+
+  // MDRangePolicy for the 3D grid loop:
+  template <class Device, class TagComputeSNAP>
+  using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
+
+  // Testing out team policies
+  template <class Device, int num_teams,  class TagComputeSNAP>
+  using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
+  //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNAP>;
+  //using team_member = typename team_policy::member_type;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <class Device, int num_teams, class TagComputeSNAP>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
 
   ComputeSNAGridKokkos(class LAMMPS *, int, char **);
   ~ComputeSNAGridKokkos() override;
 
   void init() override;
-  //void compute_array(int, int) override;
-  //double memory_usage() override;
-  
+  void setup() override;
+  void compute_array() override;
+
+  // Utility functions for teams
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&);
+
+  // operator function for example team policy
+  //KOKKOS_INLINE_FUNCTION
+  //void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeam>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoop, const int& ) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoopCPU, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+
  protected:
+
   SNAKokkos<DeviceType, real_type, vector_length> snaKK;
 
+  int chunk_size, chunk_offset;
+  int host_flag;
+  int ntotal;
+  int total_range; // total number of loop iterations in grid
+  int zlen; //= nzhi-nzlo+1;
+  int ylen; //= nyhi-nylo+1;
+  int xlen; //= nxhi-nxlo+1;
 
-  using KKDeviceType = typename KKDevice<DeviceType>::value;
+  double cutsq_tmp; // temporary cutsq until we get a view
+
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
+  //Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+  DAT::tdual_float_2d k_grid;
+  DAT::tdual_float_2d k_gridall;
+  typename AT::t_float_2d d_grid;
+  typename AT::t_float_2d d_gridall;
+
+  //DAT::tdual_float_4d k_gridlocal;
+  //typedef Kokkos::DualView<real_type****, Kokkos::LayoutLeft, DeviceType> t_gridlocal_4d;
+  //typedef Kokkos::View<real_type****, DeviceType> t_4d;
+  typedef Kokkos::DualView<LMP_FLOAT****, LMPDeviceType> tdual_float_4d;
+  tdual_float_4d k_gridlocal;
+  tdual_float_4d d_gridlocal; 
+
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
 
 };
 
 // These wrapper classes exist to make the compute style factory happy/avoid having
-// to extend the compute style factory to support Compute classes w/an arbitrary number
+// to extend the compute  style factory to support Compute classes w/an arbitrary number
 // of extra template parameters
 
 template <class DeviceType>
@@ -76,10 +267,9 @@ class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_
  public:
 
   ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
-  //ComputeSNAGridKokkosDevice(class LAMMPS *);
 
   void init() override;
-  //double memory_usage() override;
+  void compute_array() override;
 
 };
 
@@ -93,10 +283,9 @@ class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KO
  public:
 
   ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
-  //ComputeSNAGridKokkosHost(class LAMMPS *);
 
-  void init();
-  //double memory_usage();
+  void init() override;
+  void compute_array() override;
 
 };
 #endif
@@ -105,3 +294,45 @@ class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KO
 
 #endif
 #endif
+
+// The following will compile with the chunk in cpp file but we're gonna try wrapper like pair snap.
+/*
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
+
+#include "compute_sna_grid.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+//template<int CSTYLE, int NCOL>
+//struct TagComputeCoordAtom{};
+
+template<class DeviceType>
+class ComputeSNAGridKokkos : public ComputeSNAGrid {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  ComputeSNAGridKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridKokkos() override;
+
+ private:
+
+};
+
+}
+
+#endif
+#endif
+*/
+
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index e958fcdb45..b0cf30d070 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -3,12 +3,10 @@
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
    LAMMPS development team: developers@lammps.org
-
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
@@ -18,17 +16,20 @@
 ------------------------------------------------------------------------- */
 
 #include "compute_sna_grid_kokkos.h"
+#include "pair_snap_kokkos.h"
 
 #include "atom_kokkos.h"
 #include "atom_masks.h"
 #include "comm.h"
 #include "error.h"
-#include "force.h"
-#include "kokkos.h"
 #include "memory_kokkos.h"
-#include "neighbor_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
 #include "neigh_request.h"
+#include "neighbor_kokkos.h"
+//#include "sna_kokkos.h"
 #include "sna.h"
+#include "update.h"
 
 #include <cmath>
 #include <cstdlib>
@@ -39,69 +40,757 @@
 
 namespace LAMMPS_NS {
 
-/* ---------------------------------------------------------------------- */
+// Constructor
 
 template<class DeviceType, typename real_type, int vector_length>
-ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid (lmp, narg, arg)
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
 {
   //respa_enable = 0;
 
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  //datamask_read = EMPTY_MASK;
-  //datamask_modify = EMPTY_MASK;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
 
-  //host_flag = (execution_space == Host);
+  k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // d_map is not allocated here: init() sizes it to atom->ntypes+1 entries.
+  // NOTE(review): k_cutsq is only allocated above; no code visible here
+  // fills it, and the neighbor kernel compares against cutsq_tmp instead.
+  // cutsq_tmp caches the type-1/type-1 entry of the base-class cutsq table,
+  // so a single cutoff is applied to every atom type -- TODO confirm intent.
+
+
+  printf("^^^^^ cutsq: %f\n", cutsq[1][1]);
+
+  cutsq_tmp = cutsq[1][1];
+
+  // The grid arrays (k_grid, k_gridall, k_gridlocal) are not created here;
+  // setup() allocates them.
 }
 
-/* ---------------------------------------------------------------------- */
+// Destructor
 
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
 {
   if (copymode) return;
 
+  // NOTE(review): the arrays created in setup() (k_grid, k_gridall,
+  // k_gridlocal) are not released here -- confirm who owns their cleanup.
+  printf("^^^ Finish ComputeSNAGridKokkos destructor\n");
 }
 
-/* ---------------------------------------------------------------------- */
+// Init
 
 template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
 {
+  // Device-side initialization: runs the base-class init(), mirrors the
+  // per-element parameters and the type -> element map into Kokkos views,
+  // and constructs the SNAKokkos engine used by compute_array().
+
+  // Initialize the base class first; the device-side data built below is
+  // copied from members that ComputeSNAGrid owns (radelem, wjelem, map, ...).
+  // This mirrors pair snap/kk, whose coeff() calls the non-Kokkos coeff()
+  // before allocating its device parameters.
+
+  ComputeSNAGrid::init();
+
+  // Allocate the per-element device arrays and the type -> element map.
+  // pair snap/kk also keeps a d_coeffelem array (sized with ncoeffall);
+  // it is intentionally not allocated here.
+
+  const int ntypes = atom->ntypes;
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"pair:wjelem",nelements);
+  MemKK::realloc_kokkos(d_sinnerelem,"pair:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"pair:dinnerelem",nelements);
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",ntypes+1);
+
+  // Host mirrors: filled on the host below, then copied to the device.
+
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+
+  // Copy the per-element parameters. The inner-switching arrays are only
+  // read from the base class when switchinnerflag is set.
+  // NOTE(review): radelem/wjelem are read with a 0-based element index here;
+  // confirm that matches how ComputeSNAGrid fills them.
+
+  for (int ielem = 0; ielem < nelements; ielem++) {
+    h_radelem(ielem) = radelem[ielem];
+    h_wjelem(ielem) = wjelem[ielem];
+    if (switchinnerflag) {
+      h_sinnerelem(ielem) = sinnerelem[ielem];
+      h_dinnerelem(ielem) = dinnerelem[ielem];
+    }
+  }
+
+  // Fill the type -> element map for types 1..ntypes. The kernels index
+  // d_map unconditionally, so when chemflag is off (the base-class `map` is
+  // not read in that case) every type is mapped to element 0 rather than
+  // leaving the entries unset.
+
+  for (int itype = 1; itype <= ntypes; itype++)
+    h_map(itype) = chemflag ? map[itype] : 0;
+
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag) {
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  Kokkos::deep_copy(d_map,h_map);
+
+  // Build the Kokkos SNA engine from the base-class settings. grow_rij(0,0)
+  // is called with zero sizes here; compute_array() calls grow_rij() again
+  // with the grid-point and atom counts.
+
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
+    rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
 
-  printf("^^^ inside ComputeSNAGridKokkos init\n");
-  // from pair_snap_kokkos_impl.h :
-  /*
   if (host_flag) {
-    if (lmp->kokkos->nthreads > 1)
-      error->all(FLERR,"Pair style snap/kk can currently only run on a single "
-                         "CPU thread");
 
-    PairSNAP::init_style();
+    // Host execution space: nothing more to set up. The base class was
+    // already initialized at the top of this function, so it is not
+    // initialized a second time here.
+    // NOTE(review): pair snap/kk rejects running with more than one CPU
+    // thread at this point (via lmp->kokkos->nthreads); that check does not
+    // compile here because the class behind lmp->kokkos is incomplete.
     return;
   }
 
-  if (force->newton_pair == 0)
-    error->all(FLERR,"Pair style SNAP requires newton pair on");
+  // No neighbor list is requested here; the kernels loop over all atoms.
 
-  // neighbor list request for KOKKOS
-
-  neighflag = lmp->kokkos->neighflag;
-
-  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
-  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-                           !std::is_same<DeviceType,LMPDeviceType>::value);
-  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-  if (neighflag == FULL)
-    error->all(FLERR,"Must use half neighbor list style with pair snap/kk");
-  */
 }
 
+// Setup
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
+{
+  // ComputeGrid::setup() is deliberately not called: it would allocate the
+  // grid array itself. Only the grid index bounds are set here; the arrays
+  // are created below through memoryKK instead.
+  // NOTE(review): a repeated setup() re-creates these arrays -- confirm
+  // that the previously created ones are released.
+  ComputeGrid::set_grid_global();
+  ComputeGrid::set_grid_local();
+
+  // allocate arrays
+
+  memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
+  memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
+  if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
+    gridlocal_allocated = 1;
+    memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo,
+                                     nyhi, nxlo, nxhi, "grid:gridlocal");
+  }
+  array = gridall;
+}
+
+// Compute
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
+{
+  // Evaluates the SNAP bispectrum for every local grid point on the device.
+
+  // Each grid point plays the role an atom plays in pair snap/kk, and the
+  // kernels launched below follow that pair style's sequence: ComputeNeigh,
+  // ComputeCayleyKlein, PreUi, ComputeUi, TransformUi, ComputeZi, ComputeBi.
+  // NOTE(review): the results remain inside snaKK; nothing in this function
+  // copies them into the grid arrays allocated in setup().
+
+  if (host_flag) {
+    // NOTE(review): the host path is not implemented yet. For reference,
+    // pair snap/kk does the following here:
+    //   atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK);
+    //   PairSNAP::compute(eflag_in,vflag_in);
+    //   atomKK->modified(Host,F_MASK);
+    return;
+  }
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  // number of grid points owned by this processor
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  // x and type are device views: they must not be dereferenced from host
+  // code such as this function, only inside the kernels.
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // per-grid-point neighbor counts, filled by the ComputeNeigh kernel
+  MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",total_range);
+
+  // Pair snap/kk sizes its neighbor arrays with a maximum neighbor count,
+  // but compute sna/grid has no neighbor list: every local and ghost atom
+  // is a candidate neighbor of every grid point.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+
+  // All grid points are processed in one pass, so chunk_size is the full
+  // local grid. It must be set before grow_rij(): the kernels index the
+  // per-point arrays up to chunk_size-1, and sizing those arrays with
+  // MIN(chunksize,total_range) while iterating over total_range points
+  // overruns them whenever total_range exceeds the user-set chunksize.
+  // NOTE(review): real chunking (looping over batches of at most chunksize
+  // grid points) would bound the memory use; it is not implemented here.
+
+  chunk_size = total_range;
+
+  // ensure rij, inside, and typej can hold ntotal neighbors for each of
+  // the chunk_size grid points
+
+  snaKK.grow_rij(chunk_size, ntotal);
+
+  // An earlier experiment with a custom team policy, kept for reference:
+  //   CSNAGridTeamPolicy<DeviceType, team_size_compute_neigh ,TagCSNAGridTeam>
+  //     team_policy(chunk_size,team_size_compute_neigh,vector_length);
+  //   Kokkos::parallel_for("TeamPolicy",team_policy,*this);
+
+  // Kernel launches (each tag selects an operator() overload of this class):
+  //   team policies   : ComputeNeigh, ComputeUiSmall / ComputeUiLarge
+  //   3D range policy : ComputeCayleyKlein, PreUi, TransformUi, ComputeZi,
+  //                     ComputeBi
+
+  // team_size_compute_neigh is defined in `pair_snap_kokkos.h`
+
+
+  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+  //ComputeNeigh 
+  {
+    int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * ntotal);
+
+    SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
+      policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+    policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+    Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+  }
+
+  //ComputeCayleyKlein
+  {
+    // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
+    Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
+      policy_compute_ck({0,0,0}, {vector_length, ntotal, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+    Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+  }
+
+  //PreUi
+  {
+    // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
+    Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
+      policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
+    Kokkos::parallel_for("PreUi",policy_preui,*this);
+  }
+
+  // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+  {
+    // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
+    // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+    const int tile_size = vector_length * (twojmax + 1);
+    const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+    if (chunk_size < parallel_thresh)
+    {
+      // Version with parallelism over j_bend
+
+      // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
+      const int n_teams = chunk_size_div * ntotal * (twojmax + 1);
+      const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
+        policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+      policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+    } else {
+      // Version w/out parallelism  over j_bend
+
+      // total number of teams needed: (natoms / 32) * (ntotal)
+      const int n_teams = chunk_size_div * ntotal;
+      const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
+        policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+      policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+    }
+  }
+
+  //TransformUi: un-"fold" ulisttot, zero ylist
+  {
+    // team_size_transform_ui is defined in `pair_snap_kokkos.h`
+    Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
+        policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
+    Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
+  }
+
+  //Compute bispectrum in AoSoA data layout, transform Bi
+  //if (quadraticflag || eflag) {
+
+  //ComputeZi
+  const int idxz_max = snaKK.idxz_max;
+  Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
+      policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
+  Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
+
+  //ComputeBi
+  const int idxb_max = snaKK.idxb_max;
+  Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
+      policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
+  Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
+
+  //Looks like best way to grab blist is in a parallel_for
+
+  //Transform data layout of blist out of AoSoA
+  //We need this because `blist` gets used in ComputeForce which doesn't
+  //take advantage of AoSoA, which at best would only be beneficial on the margins
+  //NOTE: Do we need this in compute sna/grid/kk?
+  /*
+  Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagPairSNAPTransformBi>
+      policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
+  Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+  */
+
+
+
+  // let's try a simple parallel for loop
+  // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this 
+  // function, because this is a host-device function.
+  /*
+  typename Kokkos::RangePolicy<DeviceType,TagComputeSNAGridLoop> policy_loop(0,4);
+  Kokkos::parallel_for("Loop",policy_loop,*this);
+  */
+
+
+  // Simple working loop:
+  /* 
+  Kokkos::parallel_for("Loop1", 4, KOKKOS_LAMBDA (const int& i) {
+    printf("Greeting from iteration %i\n",i);
+  });
+  */
+
+  /*
+  // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this 
+  // function, because this is a host-device function.
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+  Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
+      policy_compute_ck({0,0,0},{vector_length,ntotal,chunk_size_div},{vector_length,tile_size_compute_ck,1});
+  Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this);
+  */
+
+  // Simple example of 3D MD range policy.
+  // Begin loop over grid points.
+  /*
+  // NOTE: We don't get the compiler error calling host function DeviceType::in_parallel() in this 
+  // function, but we get it in the above function.
+  int n = 3; // bounds for mdrange policy
+  typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagComputeSNAGrid3D> policy_3d({0,0,0},{n,n,n});
+  Kokkos::parallel_for("3D",policy_3d,*this);
+  */
+
+  copymode = 0;
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+/*
+ ComputeNeigh: finds the atoms within the cutoff of one grid point and stores their data in snaKK.
+ */
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
+
+  // this function follows the same procedure as ComputeNeigh of PairSNAPKokkos,
+  // with grid points taking the place of atoms
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // basic quantities associated with this team:
+  // team_rank : rank of thread in this team
+  // league_rank : rank of team in this league
+  // team_size : number of threads in this team
+  // each thread of each team handles one flattened local grid index ii
+
+  // extract loop index
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+  if (ii >= chunk_size) return;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  const int tile_size = ntotal; // number of elements per thread
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  // (the launch in compute_array() requests team_size * ntotal ints of scratch)
+  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert the flattened index ii to (ix,iy,iz) grid indices, with ix
+  // varying fastest, then shift by the lower bounds of the local grid
+  // (xlen and ylen are set in compute_array())
+
+  int iz = ii/(xlen*ylen);
+  int i2 = ii - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+  // position of the grid point, formed directly from its indices
+
+  // grid2x(igrid, xgrid) is not called here: it would first convert a
+  // flattened index back to (ix,iy,iz), which are already available.
+
+  // delx, dely, delz are presumably the grid spacings along x, y, z
+  // -- TODO confirm against ComputeGrid
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+  const double xtmp = xgrid[0];
+  const double ytmp = xgrid[1];
+  const double ztmp = xgrid[2];
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+
+  const int itype = 1;
+  const int ielem = chemflag ? d_map[itype] : 0; // d_map is only filled with chemflag
+  const double radi = d_radelem[ielem];
+
+  // NOTE: triclinic boxes would need a lamda2x conversion here (not implemented).
+  if (triclinic){
+    printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
+  } else {
+    //printf("We are not triclinic\n");
+  }
+
+  // First pass: classify every atom as inside or outside the cutoff, cache
+  // the result in type_cache, and count the atoms that are inside.
+
+  // Compute the number of neighbors
+  int ninside = 0;
+  // every local and ghost atom is a candidate (there is no neighbor list),
+  // so the vector loop runs over all ntotal atoms
+  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
+    [&] (const int j, int& count) {
+
+    // From pair snap/kk :
+    /*
+    T_INT j = d_neighbors(i,jj);
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    */
+    // From compute sna/grid/kk :
+    /*
+    const double delx = xtmp - x[j][0];
+    const double dely = ytmp - x[j][1];
+    const double delz = ztmp - x[j][2];
+    */
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+
+    // squared distance to, and type of, the candidate atom
+
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    int jtype = type(j);
+
+    // An atom counts as a neighbor only if 1e-20 < rsq < cutoff^2, matching
+    // the reference test `rsq < cutsq[jtype][jtype] && rsq > 1e-20` that the
+    // original comment here quoted. The lower bound rejects an atom that
+    // coincides with the grid point (zero separation).
+    // NOTE(review): one cutoff (cutsq_tmp) is applied to all types;
+    // rnd_cutsq(itype,jtype) would be the per-type-pair value.
+
+    if (rsq >= cutsq_tmp || rsq <= 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    }
+
+    // Cache the classification in this thread's scratch row so that the
+    // second pass can reuse it instead of repeating the distance test.
+    // (type_cache has ntotal entries per thread, so j is always in range.)
+    type_cache[j] = jtype;
+
+    if (jtype >= 0)
+     count++;
+
+  }, ninside);
+
+  // number of atoms inside the cutoff of grid point ii
+
+  d_ninside(ii) = ninside;
+
+  // Second pass: pack the data of the inside atoms contiguously for ii.
+  Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
+    [&] (const int j, int& offset, bool final) {
+
+    const int jtype = type_cache[j];
+
+    if (jtype >= 0) {
+      if (final) {
+        const F_FLOAT dx = x(j,0) - xtmp;
+        const F_FLOAT dy = x(j,1) - ytmp;
+        const F_FLOAT dz = x(j,2) - ztmp;
+        const int jelem = chemflag ? d_map[jtype] : 0;
+        my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+        my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+        my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+        my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[jelem])*rcutfac);
+        my_sna.inside(ii,offset) = j;
+        if (switchinnerflag) {
+          my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+          my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+        }
+        if (chemflag)
+          my_sna.element(ii,offset) = jelem;
+        else
+          my_sna.element(ii,offset) = 0;
+      }
+      offset++;
+    }
+  });
+}
+
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+  // Calls snaKK's compute_cayley_klein for one (grid point, neighbor) pair.
+  //   iatom_mod : position of the grid point inside its vector_length pack
+  //   jnbor     : neighbor slot, ranging over [0, ntotal) in the launch
+  //   iatom_div : index of the pack; ii = iatom_mod + iatom_div * vector_length
+
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int ii = iatom_mod + iatom_div * vector_length;
+  if (ii >= chunk_size) return;
+
+  // Only the first d_ninside(ii) neighbor slots were filled by ComputeNeigh.
+  // The launch spans all ntotal slots, so the unfilled ones must be skipped
+  // here; bounding by ntotal instead would make this guard a no-op and run
+  // the computation on slots that were never written.
+
+  const int ninside = d_ninside(ii);
+  if (jnbor >= ninside) return;
+
+  my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+  const int ii = iatom_mod + iatom_div * vector_length;
+  if (ii >= chunk_size) return;
+  // ii is a grid-point index, not an atom index, so type(ii) would read the
+  // type of an unrelated atom (or run past the atom arrays on large grids).
+  // Grid points are all treated as type 1, as in the ComputeNeigh kernel.
+  const int itype = 1;
+  const int ielem = chemflag ? d_map[itype] : 0;
+  my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
+  // ComputeUi, small-chunk variant: one team thread per
+  // (pack of grid points, neighbor slot, bend location) triple.
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // flattened (iatom_div, jbend, jj) index handled by this thread
+  const int flat = team.league_rank() * team_size_compute_ui + team.team_rank();
+
+  // unflatten: iatom_div is the slowest index, jj (neighbor slot) the fastest
+  const int per_pack = ntotal * (twojmax + 1);
+  int iatom_div = flat / per_pack; // not "const": works around a GCC 7 bug
+  const int remainder = flat % per_pack;
+  const int jbend = remainder / ntotal;
+  int jj = remainder % ntotal; // not "const": works around a GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    // skip padding lanes and neighbor slots that ComputeNeigh did not fill
+    if (ii >= chunk_size || jj >= d_ninside(ii)) return;
+
+    my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
+  // ComputeUi, large-chunk variant: one team thread per (pack of grid points, neighbor slot) pair.
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // flattened (iatom_div, jj) index handled by this thread
+  const int flat = team.league_rank() * team_size_compute_ui + team.team_rank();
+
+  // unflatten: iatom_div is the slow index, jj (neighbor slot) the fast one
+  int iatom_div = flat / ntotal; // not "const": works around a GCC 7 bug
+  int jj = flat % ntotal;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+
+    // skip padding lanes and neighbor slots that ComputeNeigh did not fill
+    if (ii >= chunk_size || jj >= d_ninside(ii)) return;
+
+    my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
+  // TransformUi: copies the half-range ulisttot into the full-range ulisttot_pack and zeroes ylist.
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  // valid indices are [0, idxu_max); ">" would let idxu_max itself through
+  if (idxu >= my_sna.idxu_max) return;
+
+  // the full -> half mapping depends only on idxu, so it is looked up once
+  const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
+  const int elem_count = chemflag ? nelements : 1;
+
+  for (int ielem = 0; ielem < elem_count; ielem++){
+    auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    // flip_sign: 1 negates the imaginary part, -1 the real part, 0 neither
+    if (mapper.flip_sign == 1){
+      utot_im = -utot_im;
+    } else if (mapper.flip_sign == -1){
+      utot_re = -utot_re;
+    }
+
+    my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
+
+    // entries that map without a sign flip also zero the matching ylist slot
+    if (mapper.flip_sign == 0) {
+      my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+      my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+    }
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
+  // ComputeZi: one work item per (grid point, jjz) pair; out-of-range items are skipped.
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const bool in_range = (iatom < chunk_size) && (jjz < my_sna.idxz_max);
+  if (!in_range) return;
+
+  my_sna.compute_zi(iatom_mod,jjz,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
+  // ComputeBi: one work item per (grid point, jjb) pair; out-of-range items are skipped.
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const bool in_range = (iatom < chunk_size) && (jjb < my_sna.idxb_max);
+  if (!in_range) return;
+
+  my_sna.compute_bi(iatom_mod,jjb,iatom_div);
+}
+
+/*
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
+
+
+}
+*/
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the CPU codepath. These do not take
+   advantage of AoSoA data layouts, but that could be a good point of
+   future optimization and unification with the above kernels. It's unlikely
+   that scratch memory optimizations will ever be useful for the CPU due to
+   different arithmetic intensity requirements for the CPU vs GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagComputeSNAGridLoopCPU,const int& ii) const {
+  // placeholder: no CPU-path work is implemented for this tag yet
+}
+
+/* ----------------------------------------------------------------------
+   utility functions
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
+  // Clamp team_size so that team_size * vector_length does not exceed the
+  // largest team Kokkos reports for a parallel_for with this functor and tag.
+  const int team_size_max =
+    Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size_max < team_size*vector_length) team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
+  // Clamp team_size so that team_size * vector_length does not exceed the
+  // largest team Kokkos reports for a parallel_reduce with this functor and tag.
+  const int team_size_max =
+    Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  if (team_size_max < team_size*vector_length) team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<typename scratch_type>
+int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
+  // shmem_size() of an unmanaged scratch view holding values_per_team entries of scratch_type
+  using ScratchViewType = Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+  return ScratchViewType::shmem_size(values_per_team);
+}
+
+/* ---------------------------------------------------------------------- */
+
 /* ----------------------------------------------------------------------
    routines used by template reference classes
 ------------------------------------------------------------------------- */
 
+
 template<class DeviceType>
 ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
    : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
@@ -112,6 +801,12 @@ void ComputeSNAGridKokkosDevice<DeviceType>::init()
   Base::init();
 }
 
+template<class DeviceType>
+void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
+{
+  Base::compute_array();  // forward to the Base implementation
+}
+
 #ifdef LMP_KOKKOS_GPU
 template<class DeviceType>
 ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
@@ -122,6 +817,12 @@ void ComputeSNAGridKokkosHost<DeviceType>::init()
 {
   Base::init();
 }
+
+template<class DeviceType>
+void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
+{
+  Base::compute_array();  // forward to the Base implementation
+}
 #endif
 
 }
diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h
index 9d894a344a..35a7ceaeb4 100644
--- a/src/KOKKOS/memory_kokkos.h
+++ b/src/KOKKOS/memory_kokkos.h
@@ -183,6 +183,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
   return data;
 }
 
+/* ----------------------------------------------------------------------
+   create a 4d array with indices 2,3,4 offset, but not first
+   2nd index from n2lo to n2hi inclusive
+   3rd index from n3lo to n3hi inclusive
+   4th index from n4lo to n4hi inclusive
+   cannot grow it
+------------------------------------------------------------------------- */
+
+template <typename TYPE>
+TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array,
+                             int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi,
+                             const char *name)
+{
+  //if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array =  nullptr;
+
+  printf("^^^^^ memoryKK->create_4d_offset_kokkos\n");
+
+  int n2 = n2hi - n2lo + 1;
+  int n3 = n3hi - n3lo + 1;
+  int n4 = n4hi - n4lo + 1;
+  data = TYPE(std::string(name),n1,n2,n3,n4);
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1;
+  array = (typename TYPE::value_type ****) smalloc(nbytes,name);
+
+  for (int i = 0; i < n1; i++) {
+    if (n2 == 0) {
+      array[i] = nullptr;
+    } else {
+      nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2;
+      array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name);
+      for (int j = 0; j < n2; j++){
+        if (n3 == 0){
+          array[i][j] = nullptr;
+        } else {
+          nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3;
+          array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name);
+          for (int k = 0; k < n3; k++){
+            if (n4 == 0)
+              array[i][j][k] = nullptr;
+            else
+              array[i][j][k] = &data.h_view(i,j,k,0);
+          }
+        }
+      }
+    }
+  }
+
+  return data;
+}
+
 template <typename TYPE, typename HTYPE>
   TYPE create_kokkos(TYPE &data, HTYPE &h_data,
                      typename TYPE::value_type **&array, int n1, int n2,
diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp
index 2179bb8ebd..ad70df30e8 100644
--- a/src/ML-SNAP/compute_grid.cpp
+++ b/src/ML-SNAP/compute_grid.cpp
@@ -57,6 +57,8 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeGrid::~ComputeGrid()
 {
+  printf("^^^ begin ComputeGrid destructor\n");
+  if (copymode) return;
   deallocate();
 }
 
@@ -111,7 +113,7 @@ void ComputeGrid::assign_coords_all()
 void ComputeGrid::allocate()
 {
   // allocate arrays
-
+  printf("^^^^^^^^^^^^^^^ ComputeGrid::allocate()\n");
   memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
   memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
   if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
diff --git a/src/ML-SNAP/compute_sna_grid.cpp b/src/ML-SNAP/compute_sna_grid.cpp
index 4243202545..36780213f2 100644
--- a/src/ML-SNAP/compute_sna_grid.cpp
+++ b/src/ML-SNAP/compute_sna_grid.cpp
@@ -31,14 +31,14 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
   // skip over arguments used by base class
   // so that argument positions are identical to
   // regular per-atom compute
-
+  printf("^^^ inside compute sna grid constructor\n");
   arg += nargbase;
   narg -= nargbase;
 
   // begin code common to all SNAP computes
 
-  double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
 
   int ntypes = atom->ntypes;
   int nargmin = 6 + 2 * ntypes;
@@ -56,6 +56,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
   wselfallflag = 0;
   switchinnerflag = 0;
   nelements = 1;
+  chunksize = 32768;
+  parallel_thresh = 8192;
 
   // process required arguments
 
@@ -112,6 +114,7 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
       quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
       iarg += 2;
     } else if (strcmp(arg[iarg], "chem") == 0) {
+      printf("^^^ chem flag, creating map\n");
       if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
       chemflag = 1;
       memory->create(map, ntypes + 1, "compute_sna_grid:map");
@@ -181,11 +184,17 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeSNAGrid::~ComputeSNAGrid()
 {
-  memory->destroy(radelem);
-  memory->destroy(wjelem);
-  memory->destroy(cutsq);
-  delete snaptr;
+  if (copymode) return;
 
+  printf("^^^ begin ComputeSNAGrid destructor\n");
+  memory->destroy(radelem);
+  printf("^^^^ CSG 1\n");
+  memory->destroy(wjelem);
+  printf("^^^^ CSG 2\n");
+  memory->destroy(cutsq);
+  printf("^^^^ CSG 3\n");
+  delete snaptr;
+  printf("^^^^ CSG 4\n");
   if (chemflag) memory->destroy(map);
 }
 
@@ -196,12 +205,16 @@ void ComputeSNAGrid::init()
   if ((modify->get_compute_by_style("^sna/grid$").size() > 1) && (comm->me == 0))
     error->warning(FLERR, "More than one instance of compute sna/grid");
   snaptr->init();
+
+  printf("^^^ finished ComputeSNAGrid init()\n");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeSNAGrid::compute_array()
 {
+  printf("^^^ inside ComputeSNAGrid compute_array()\n");
+
   invoked_array = update->ntimestep;
 
   // compute sna for each gridpoint
@@ -211,6 +224,8 @@ void ComputeSNAGrid::compute_array()
   int *const type = atom->type;
   const int ntotal = atom->nlocal + atom->nghost;
 
+  printf("^^^ ntotal: %d\n", ntotal);
+
   // ensure rij, inside, and typej are of size jnum
 
   snaptr->grow_rij(ntotal);
diff --git a/src/ML-SNAP/compute_sna_grid.h b/src/ML-SNAP/compute_sna_grid.h
index 3a5a373826..a158c2342f 100644
--- a/src/ML-SNAP/compute_sna_grid.h
+++ b/src/ML-SNAP/compute_sna_grid.h
@@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid {
   void init() override;
   void compute_array() override;
   double memory_usage() override;
+  int ncoeff,nelements; // public for kokkos, but could go in the protected block now
 
- private:
-  int ncoeff;
+ protected:
+  //int ncoeff;
   double **cutsq;
   double rcutfac;
   double *radelem;
   double *wjelem;
   int *map;    // map types to [0,nelements)
-  int nelements, chemflag;
+  int chemflag;
   int switchinnerflag;
   double *sinnerelem;
   double *dinnerelem;
+  int parallel_thresh;
   class SNA *snaptr;
   double cutmax;
   int quadraticflag;
+  double rfac0, rmin0;
+  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  int chunksize;
+
 };
 
 }    // namespace LAMMPS_NS

From 75392648469488627f091138314cfc3ee59121ea Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sat, 8 Apr 2023 13:08:24 -0600
Subject: [PATCH 08/51] Sync device and host compute arrays

---
 src/KOKKOS/compute_sna_grid_kokkos.h      | 10 +++--
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 48 ++++++++---------------
 src/KOKKOS/kokkos_type.h                  | 16 ++++++++
 3 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index b461f755b8..6b85300cda 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -242,9 +242,13 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   //DAT::tdual_float_4d k_gridlocal;
   //typedef Kokkos::DualView<real_type****, Kokkos::LayoutLeft, DeviceType> t_gridlocal_4d;
   //typedef Kokkos::View<real_type****, DeviceType> t_4d;
-  typedef Kokkos::DualView<LMP_FLOAT****, LMPDeviceType> tdual_float_4d;
-  tdual_float_4d k_gridlocal;
-  tdual_float_4d d_gridlocal; 
+  // should we use LMPDeviceType below?
+  //typedef Kokkos::DualView<LMP_FLOAT****, LMPDeviceType> tdual_float_4d;
+  //typedef tdual_float_4d::t_dev tdev_float_4d;
+  //tdual_float_4d k_gridlocal;
+  //tdev_float_4d d_gridlocal; 
+  DAT::tdual_float_4d k_gridlocal;
+  typename AT::t_float_4d d_gridlocal;
 
 
   // Utility routine which wraps computing per-team scratch size requirements for
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index b0cf30d070..583b5d1a46 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -198,6 +198,10 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
                                      nyhi, nxlo, nxhi, "grid:gridlocal");
   }
   array = gridall;
+
+  d_gridlocal = k_gridlocal.template view<DeviceType>();
+  d_grid = k_grid.template view<DeviceType>();
+  d_gridall = k_gridall.template view<DeviceType>();
 }
 
 // Compute
@@ -372,41 +376,21 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   */
 
 
+  // populate the gridlocal array
+  // best to do parallel loop over grid points again
+  // ...
 
-  // let's try a simple parallel for loop
-  // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this 
-  // function, because this is a host-device function.
-  /*
-  typename Kokkos::RangePolicy<DeviceType,TagComputeSNAGridLoop> policy_loop(0,4);
-  Kokkos::parallel_for("Loop",policy_loop,*this);
-  */
+  // d_grid(0,0) = 1.0; // attempt to access inaccessible memory space
 
+  k_gridlocal.template modify<DeviceType>();
+  k_gridlocal.template sync<LMPHostType>();
 
-  // Simple working loop:
-  /* 
-  Kokkos::parallel_for("Loop1", 4, KOKKOS_LAMBDA (const int& i) {
-    printf("Greeting from iteration %i\n",i);
-  });
-  */
+  k_grid.template modify<DeviceType>();
+  k_grid.template sync<LMPHostType>();
 
-  /*
-  // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this 
-  // function, because this is a host-device function.
-  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
-  Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
-      policy_compute_ck({0,0,0},{vector_length,ntotal,chunk_size_div},{vector_length,tile_size_compute_ck,1});
-  Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this);
-  */
-
-  // Simple example of 3D MD range policy.
-  // Begin loop over grid points.
-  /*
-  // NOTE: We don't get the compiler error calling host function DeviceType::in_parallel() in this 
-  // function, but we get it in the above function.
-  int n = 3; // bounds for mdrange policy
-  typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagComputeSNAGrid3D> policy_3d({0,0,0},{n,n,n});
-  Kokkos::parallel_for("3D",policy_3d,*this);
-  */
+  k_gridall.template modify<DeviceType>();
+  k_gridall.template sync<LMPHostType>();
+  
 
   printf("^^^ End ComputeSNAGridKokkos compute_array()\n");
 }
@@ -437,6 +421,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   int ii = team.team_rank() + team.league_rank() * team.team_size();
   if (ii >= chunk_size) return;
 
+  d_gridall(ii,0) = 100.0;
+
   // get a pointer to scratch memory
   // This is used to cache whether or not an atom is within the cutoff.
   // If it is, type_cache is assigned to the atom type.
diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h
index a496f6ff94..456a22ac56 100644
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@@ -717,6 +717,14 @@ typedef tdual_float_3d::t_dev_um t_float_3d_um;
 typedef tdual_float_3d::t_dev_const_um t_float_3d_const_um;
 typedef tdual_float_3d::t_dev_const_randomread t_float_3d_randomread;
 
+//4d float array n
+typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
+typedef tdual_float_4d::t_dev t_float_4d;
+typedef tdual_float_4d::t_dev_const t_float_4d_const;
+typedef tdual_float_4d::t_dev_um t_float_4d_um;
+typedef tdual_float_4d::t_dev_const_um t_float_4d_const_um;
+typedef tdual_float_4d::t_dev_const_randomread t_float_4d_randomread;
+
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else
@@ -1017,6 +1025,14 @@ typedef tdual_float_2d::t_host_um t_float_2d_um;
 typedef tdual_float_2d::t_host_const_um t_float_2d_const_um;
 typedef tdual_float_2d::t_host_const_randomread t_float_2d_randomread;
 
+//4d float array n
+typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
+typedef tdual_float_4d::t_host t_float_4d;
+typedef tdual_float_4d::t_host_const t_float_4d_const;
+typedef tdual_float_4d::t_host_um t_float_4d_um;
+typedef tdual_float_4d::t_host_const_um t_float_4d_const_um;
+typedef tdual_float_4d::t_host_const_randomread t_float_4d_randomread;
+
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else

From bd1134c083c7035d3b1efabb06bc43f54a7521aa Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sat, 8 Apr 2023 17:21:27 -0600
Subject: [PATCH 09/51] Debug inability to deep copy

---
 src/KOKKOS/compute_sna_grid_kokkos.h      |  15 ++
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 267 +++++++++++++++++++---
 src/ML-SNAP/compute_sna_grid.cpp          |   4 +-
 3 files changed, 255 insertions(+), 31 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index 6b85300cda..abd1c985b6 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -70,6 +70,9 @@ struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
 struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
 struct TagCSNAGridComputeZi{};
 struct TagCSNAGridComputeBi{};
+struct TagCSNAGridTransformBi{}; // re-order blist from AoSoA to AoS
+struct TagCSNAGridLocalFill{}; // fill the gridlocal array
+struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
 
 struct TagComputeSNAGridLoop{};
 struct TagComputeSNAGrid3D{};
@@ -179,6 +182,9 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   KOKKOS_INLINE_FUNCTION
   void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
 
+  // PrintNeigh
+  //void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPrintNeigh>::member_type& team) const;
+
   // 3D case - used by parallel_for
   KOKKOS_INLINE_FUNCTION
   void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
@@ -204,6 +210,15 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   KOKKOS_INLINE_FUNCTION
   void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
 
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalFill,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalFill>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalFill2,const int& ii) const;
+
  protected:
 
   SNAKokkos<DeviceType, real_type, vector_length> snaKK;
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 583b5d1a46..3148bf32ce 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -35,6 +35,8 @@
 #include <cstdlib>
 #include <cstring>
 
+#include <iostream>
+
 #define MAXLINE 1024
 #define MAXWORD 3
 
@@ -46,7 +48,7 @@ template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
 {
   //respa_enable = 0;
-
+  printf("^^^ Begin ComputeSNAGridKokkos constructor\n");
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
@@ -64,6 +66,10 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
   //int n = atom->ntypes;
   //printf("^^^ realloc d_map\n");
   //MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
+ 
+
+  printf("^^^ wjelem[0]: %f\n", wjelem[0]);
+  printf("^^^ wjelem[1]: %f\n", wjelem[1]);
   
 
   printf("^^^^^ cutsq: %f\n", cutsq[1][1]);
@@ -71,7 +77,76 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
   cutsq_tmp = cutsq[1][1];
 
   //memoryKK->create_kokkos(k_gridlocal,
-  //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); 
+  //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]);
+
+
+   // Set up element lists
+  printf("^^^ Begin kokkos reallocs with nelements = %d\n", nelements);
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
+  // pair snap kokkos uses `ncoeffall` in the following, inherits from original.
+  //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
+  int n = atom->ntypes;
+  //printf("^^^ realloc d_map\n");
+  printf("^^^ n: %d\n", n);
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
+
+  printf("^^^ begin mirrow view creation\n");
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  //auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+
+  printf("^^^ begin loop over elements, nelements = %d\n", nelements);
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= atom->ntypes; i++) {
+    printf("^^^^^ i %d\n", i);
+    h_radelem(i) = radelem[i];
+    h_wjelem(i) = wjelem[i];
+    printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]);
+    printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i));
+    if (switchinnerflag){
+      h_sinnerelem(i) = sinnerelem[i];
+      h_dinnerelem(i) = dinnerelem[i];
+    }
+    // pair snap kokkos uses `ncoeffall` in the following.
+    //for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) {
+    //  h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff];
+    //}
+  }
+
+  printf("^^^ begin loop over map\n");
+  // NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where 
+  // some things like `map` get allocated regardless of chem flag.
+  if (chemflag){ 
+    for (int i = 1; i <= atom->ntypes; i++) {
+      h_map(i) = map[i];
+      printf("%d\n", map[i]);
+    }
+  }
+
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag){
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  if (chemflag){
+    Kokkos::deep_copy(d_map,h_map);
+  }
+
+  double bytes =  MemKK::memory_usage(d_wjelem);
+  printf("^^^ bytes: %f\n", bytes);
+
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
+    rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
+
 }
 
 // Destructor
@@ -97,14 +172,15 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
 
   ComputeSNAGrid::init();
 
+  /*
   // Set up element lists
   printf("^^^ Begin kokkos reallocs\n");
   MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
-  MemKK::realloc_kokkos(d_wjelem,"pair:wjelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
   // pair snap kokkos uses `ncoeffall` in the following, inherits from original.
   //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff);
-  MemKK::realloc_kokkos(d_sinnerelem,"pair:sinnerelem",nelements);
-  MemKK::realloc_kokkos(d_dinnerelem,"pair:dinnerelem",nelements);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
   int n = atom->ntypes;
   //printf("^^^ realloc d_map\n");
   printf("^^^ n: %d\n", n);
@@ -119,15 +195,16 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
   auto h_map = Kokkos::create_mirror_view(d_map);
 
   printf("^^^ begin loop over elements, nelements = %d\n", nelements);
-  for (int ielem = 0; ielem < nelements; ielem++) {
-    printf("^^^^^ ielem %d\n", ielem);
-    h_radelem(ielem) = radelem[ielem];
-    printf("^^^^^ 1\n");
-    h_wjelem(ielem) = wjelem[ielem];
-    printf("^^^^^ 2\n");
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= atom->ntypes; i++) {
+    printf("^^^^^ i %d\n", i);
+    h_radelem(i) = radelem[i];
+    h_wjelem(i) = wjelem[i];
+    printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]);
+    printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i));
     if (switchinnerflag){
-      h_sinnerelem(ielem) = sinnerelem[ielem];
-      h_dinnerelem(ielem) = dinnerelem[ielem];
+      h_sinnerelem(i) = sinnerelem[i];
+      h_dinnerelem(i) = dinnerelem[i];
     }
     // pair snap kokkos uses `ncoeffall` in the following.
     //for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) {
@@ -159,6 +236,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
     rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
   snaKK.grow_rij(0,0);
   snaKK.init();
+  */
 
   if (host_flag) {
 
@@ -167,7 +245,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
     //  error->all(FLERR,"Compute style sna/grid/kk can currently only run on a single "
     //                     "CPU thread");
 
-    ComputeSNAGrid::init();
+    //ComputeSNAGrid::init();
     return;
   }
 
@@ -363,18 +441,34 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
       policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
   Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
 
-  //Looks like best way to grab blist is in a parallel_for
-
   //Transform data layout of blist out of AoSoA
   //We need this because `blist` gets used in ComputeForce which doesn't
   //take advantage of AoSoA, which at best would only be beneficial on the margins
   //NOTE: Do we need this in compute sna/grid/kk?
-  /*
-  Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagPairSNAPTransformBi>
+  Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridTransformBi>
       policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
   Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+
+  //Looks like best way to grab blist is in a parallel_for
+
+  //GridFill
+  /* 
+  {
+    int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * ntotal);
+
+    SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridLocalFill> 
+      policy_fill(chunk_size, team_size_compute_neigh, vector_length);
+    policy_fill = policy_fill.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+    Kokkos::parallel_for("GridLocalFill",policy_fill,*this);
+  }
   */
 
+  //GridFill2
+  {
+    typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill2> policy_fill(0,chunk_size);
+    Kokkos::parallel_for(policy_fill, *this);
+  }
+
 
   // populate the gridlocal array
   // best to do parallel loop over grid points again
@@ -408,6 +502,11 @@ KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
 
   // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos
+  //printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0));
+  //artificially set values here since we can't get the deep_copy to work
+  //d_wjelem[1] = 1.0;
+  //d_radelem[1] = 0.5;
+
 
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
@@ -421,7 +520,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   int ii = team.team_rank() + team.league_rank() * team.team_size();
   if (ii >= chunk_size) return;
 
-  d_gridall(ii,0) = 100.0;
+  //d_gridall(ii,0) = 100.0;
 
   // get a pointer to scratch memory
   // This is used to cache whether or not an atom is within the cutoff.
@@ -456,15 +555,16 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   xgrid[0] = ix * delx;
   xgrid[1] = iy * dely;
   xgrid[2] = iz * delz;
-  const double xtmp = xgrid[0];
-  const double ytmp = xgrid[1];
-  const double ztmp = xgrid[2];
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
 
   // currently, all grid points are type 1
   // not clear what a better choice would be
 
   const int itype = 1;
-  const int ielem = d_map[itype];
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
   const double radi = d_radelem[ielem];
 
   // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
@@ -529,7 +629,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   //printf("ninside: %d\n", ninside);
 
-  d_ninside(ii) = ninside;
+  d_ninside(ii) = ninside; 
+  //printf("%d\n", d_ninside(ii));
 
   // TODO: Make sure itype is appropriate instead of ielem
   Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
@@ -542,16 +643,45 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
         const F_FLOAT dx = x(j,0) - xtmp;
         const F_FLOAT dy = x(j,1) - ytmp;
         const F_FLOAT dz = x(j,2) - ztmp;
-        const int jelem = d_map[jtype];
+        int jtype = type(j);
+        //printf("jtype: %d\n", jtype);
+        int jelem = 0;
+        if (chemflag) jelem = d_map[jtype];
+        //d_wjelem[jelem] = 1.0;
+        //d_radelem[jelem] = 1.0;
         my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
         my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
         my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
-        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-        my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[jelem])*rcutfac);
+        // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jtype]);
+        //my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[jtype])*rcutfac);
+        my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jtype])*rcutfac);
         my_sna.inside(ii,offset) = j;
+
+        //printf("%f\n", my_sna.wj(ii,offset));
+
+        //printf("jelem: %d\n", jelem);
+        //printf("rij: %f %f %f\n", dx, dy, dz);
+        //printf("params: %f %f %f\n", d_wjelem[jtype], d_radelem[jtype], rcutfac);
+        //printf("%f %f %f\n", my_sna.rij(ii,offset,0), my_sna.rij(ii,offset,1), my_sna.rij(offset,2));
+        //printf("%f %f %f\n", my_sna.wj(ii,offset), my_sna.rcutij(ii,offset), my_sna.inside(ii,offset));
+        // we can't use std::cout on device code, maybe make another function for this?
+        //std::cout << my_sna.rij(ii,offset,0) << std::endl;
+        //printf("%f %f %f\n", dx, dy, dz);
+        // apparently isnan is also a host function and not allowed here...
+        /*
+        if (isnan(dx) || isnan(dy) || isnan(dz)){
+          printf("Found a nan!\n");
+        }
+        if (isnan(d_wjelem[jelem]) || isnan(radi) || isnan(d_radelem[jelem]) || isnan(rcutfac) || isnan(j)){
+          printf("Found a nan 2!\n");
+        }
+        */
+        // Our best bet is to make another non-host function for printing
+
         if (switchinnerflag) {
-          my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[itype] + d_sinnerelem[jelem]);
-          my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[itype] + d_dinnerelem[jelem]);
+          my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+          my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
         }
         if (chemflag)
           my_sna.element(ii,offset) = jelem;
@@ -621,7 +751,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     const int ii = iatom_mod + vector_length * iatom_div;
     if (ii >= chunk_size) return;
 
-    const int ninside = d_ninside(ii);
+    const int ninside = d_ninside(ii); // use ntotal or d_ninside?
     if (jj >= ninside) return;
 
     my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
@@ -713,6 +843,83 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   my_sna.compute_bi(iatom_mod,jjb,iatom_div);
 }
 
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (idxb >= my_sna.idxb_max) return;
+
+  const int ntriples = my_sna.ntriples;
+
+  for (int itriple = 0; itriple < ntriples; itriple++) {
+
+    const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
+
+    my_sna.blist(iatom, itriple, idxb) = blocal;
+  }
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalFill>::member_type& team) const {
+
+  // this function is following the same procedure in ComputeNeigh so that we can fill the grid
+
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // basic quantities associated with this team:
+  // team_rank : rank of thread in this team
+  // league_rank : rank of team in this league
+  // team_size : number of threads in this team
+  //printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size());
+
+  // extract loop index
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+  if (ii >= chunk_size) return;
+
+  //d_gridall(ii,0) = 100.0;
+
+  const auto idxb_max = snaKK.idxb_max;
+
+  // linear contributions
+  
+
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto idxb = icoeff % idxb_max;
+    const auto idx_chem = icoeff / idxb_max;
+    d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb);
+  }
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill2, const int& ii) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const auto idxb_max = snaKK.idxb_max;
+
+  // linear contributions
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto idxb = icoeff % idxb_max;
+    const auto idx_chem = icoeff / idxb_max;
+    //printf("blist: %f\n", my_sna.blist(ii,idx_chem,idxb));
+    d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb);
+
+    if (icoeff == 0){
+      //printf("%f\n", my_sna.blist(ii,idx_chem,idxb));
+    }
+  }
+
+}
+
 /*
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
diff --git a/src/ML-SNAP/compute_sna_grid.cpp b/src/ML-SNAP/compute_sna_grid.cpp
index 36780213f2..9125b7dcd4 100644
--- a/src/ML-SNAP/compute_sna_grid.cpp
+++ b/src/ML-SNAP/compute_sna_grid.cpp
@@ -69,8 +69,10 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
   twojmax = utils::inumeric(FLERR, arg[5], false, lmp);
 
   for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
-  for (int i = 0; i < ntypes; i++)
+  for (int i = 0; i < ntypes; i++) {
     wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
+    printf("^^^^^ ComputeSNAGrid wj: %f\n", wjelem[i+1]);
+  }
 
   // construct cutsq
 

From 02122c809c4e3df9b092b3cf7bfb9f154faa4eef Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sat, 8 Apr 2023 17:52:35 -0600
Subject: [PATCH 10/51] Change ntotal to d_ninside

---
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 3148bf32ce..d21d29485b 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -712,9 +712,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const int ii = iatom_mod + iatom_div * vector_length;
   if (ii >= chunk_size) return;
 
-  const int ninside = ntotal; //d_ninside(ii);
+  const int ninside = d_ninside(ii); // use d_ninside or ntotal?
   if (jnbor >= ninside) return;
 
+  printf("ninside: %d\n", ninside);
+
   my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
 }
 

From a3d8ab308861a02b77e07694c36bbebf5f8a14a7 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sat, 8 Apr 2023 19:03:25 -0600
Subject: [PATCH 11/51] Add cutoff view properly

---
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 22 +++++++++++++++++-----
 src/KOKKOS/pair_snap_kokkos_impl.h        |  2 ++
 src/KOKKOS/sna_kokkos_impl.h              |  3 +++
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index d21d29485b..6c4e11b25a 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -76,6 +76,14 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
 
   cutsq_tmp = cutsq[1][1];
 
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
+      k_cutsq.template modify<LMPHostType>();
+    }
+  }
+
+
   //memoryKK->create_kokkos(k_gridlocal,
   //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]);
 
@@ -502,10 +510,13 @@ KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
 
   // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos
-  //printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0));
+  if (d_wjelem[1] > 0){
+    printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0));
+  }
   //artificially set values here since we can't get the deep_copy to work
   //d_wjelem[1] = 1.0;
   //d_radelem[1] = 0.5;
+  //printf("%f\n", rnd_cutsq(1,1)); 
 
 
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
@@ -609,11 +620,12 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     //if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) {
     const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
 
-    //if (rsq >= rnd_cutsq(itype,jtype)) {
-    if (rsq >= cutsq_tmp){
+    //if (rsq >= cutsq_tmp){
+    if (rsq >= rnd_cutsq(itype,jtype)) {
       jtype = -1; // use -1 to signal it's outside the radius
+    } else {
+      //printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype));
     }
-    //printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype));
 
     if (j > 340){
       printf("j: %d\n", j);
@@ -715,7 +727,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const int ninside = d_ninside(ii); // use d_ninside or ntotal?
   if (jnbor >= ninside) return;
 
-  printf("ninside: %d\n", ninside);
+  //printf("ninside: %d\n", ninside);
 
   my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
 }
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index 45bacb4c97..02ba7f1604 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -664,6 +664,8 @@ template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh>::member_type& team) const {
 
+
+  printf("d_wjelem: %f %f %f %f\n", d_wjelem[0], d_wjelem[1], d_wjelem(0), d_wjelem(1));
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   // extract atom number
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index fd58f1c4f3..ba23a38af2 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -393,6 +393,9 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_cayley_klein(const
   const real_type z0 = r * cs / sn;
   const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
 
+  //printf("%f %f %f\n", sn, cs, z0);
+  //printf("%f %f %f %f %f\n", x, y, z, rcut, rmin0);
+
   const real_type wj_local = wj(iatom, jnbor);
   real_type sfac, dsfac;
   compute_s_dsfac(r, rcut, sinner, dinner, sfac, dsfac);

From a720328770ddae90d7a08a534e6433244d9f959e Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sat, 8 Apr 2023 21:32:46 -0600
Subject: [PATCH 12/51] Matching descriptors when no neighbors, good checkpoint
 for debugging

---
 src/KOKKOS/compute_sna_grid_kokkos.h      |  1 +
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 30 ++++++++++++++++-------
 src/KOKKOS/sna_kokkos_impl.h              |  4 ++-
 3 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index abd1c985b6..571e09742e 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -240,6 +240,7 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
   Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
   Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+  Kokkos::View<real_type*, DeviceType> d_test;              // test view
 
   typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
   tdual_fparams k_cutsq;
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 6c4e11b25a..ee080fab3b 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -96,6 +96,9 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
   //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff);
   MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
   MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
+  // test
+  MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
+
   int n = atom->ntypes;
   //printf("^^^ realloc d_map\n");
   printf("^^^ n: %d\n", n);
@@ -108,13 +111,16 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
   auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
   auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
   auto h_map = Kokkos::create_mirror_view(d_map);
+  // test
+  auto h_test = Kokkos::create_mirror_view(d_test);
+  h_test(0) = 2.0;
 
   printf("^^^ begin loop over elements, nelements = %d\n", nelements);
   // start from index 1 because of how compute sna/grid is
   for (int i = 1; i <= atom->ntypes; i++) {
     printf("^^^^^ i %d\n", i);
-    h_radelem(i) = radelem[i];
-    h_wjelem(i) = wjelem[i];
+    h_radelem(i-1) = radelem[i];
+    h_wjelem(i-1) = wjelem[i];
     printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]);
     printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i));
     if (switchinnerflag){
@@ -146,6 +152,8 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
   if (chemflag){
     Kokkos::deep_copy(d_map,h_map);
   }
+  // test
+  Kokkos::deep_copy(d_test,h_test);
 
   double bytes =  MemKK::memory_usage(d_wjelem);
   printf("^^^ bytes: %f\n", bytes);
@@ -510,13 +518,14 @@ KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
 
   // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos
-  if (d_wjelem[1] > 0){
-    printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0));
-  }
+  //printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0));
   //artificially set values here since we can't get the deep_copy to work
   //d_wjelem[1] = 1.0;
   //d_radelem[1] = 0.5;
-  //printf("%f\n", rnd_cutsq(1,1)); 
+  //printf("%f\n", rnd_cutsq(1,1));
+
+  //Print the test view to see that the deep copy works:
+  //printf("%f\n", d_test(0));
 
 
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
@@ -569,6 +578,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const F_FLOAT xtmp = xgrid[0];
   const F_FLOAT ytmp = xgrid[1];
   const F_FLOAT ztmp = xgrid[2];
+  //printf("%f %f %f\n", xtmp, ytmp, ztmp);
 
   // currently, all grid points are type 1
   // not clear what a better choice would be
@@ -665,9 +675,10 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
         my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
         my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
         // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
-        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jtype]);
+        // actually since the views here have values starting at 0, let's use jelem
+        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
         //my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[jtype])*rcutfac);
-        my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jtype])*rcutfac);
+        my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
         my_sna.inside(ii,offset) = j;
 
         //printf("%f\n", my_sna.wj(ii,offset));
@@ -741,7 +752,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   if (ii >= chunk_size) return;
 
   int itype = type(ii);
-  int ielem = d_map[itype];
+  //int ielem = d_map[itype];
+  int ielem = 0;
 
   my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
 }
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index ba23a38af2..8102a8b6b7 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -393,7 +393,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_cayley_klein(const
   const real_type z0 = r * cs / sn;
   const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
 
-  //printf("%f %f %f\n", sn, cs, z0);
+  //printf("jnbor: %d %f %f %f %f %f\n", jnbor, x,y,z, rfac0, rcut);
+  //printf("%f %f %f %f %f %f %f\n", rscale0, r, rmin0, theta0, sn, cs, z0);
   //printf("%f %f %f %f %f\n", x, y, z, rcut, rmin0);
 
   const real_type wj_local = wj(iatom, jnbor);
@@ -773,6 +774,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
             sumzu -= bzero[j];
           }
         }
+        //printf("%f\n", sumzu);
         blist_pack(iatom_mod, jjb, itriple, iatom_div) = sumzu;
             //} // end loop over j
           //} // end loop over j1, j2

From d75ceabfb038b7a76d74b2c541578dd008f49d24 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sat, 8 Apr 2023 21:55:04 -0600
Subject: [PATCH 13/51] Fix neighbor criteria so atoms sharing positions with
 gridpoints aren't included as neighbors

---
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index ee080fab3b..9041509e3f 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -578,7 +578,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const F_FLOAT xtmp = xgrid[0];
   const F_FLOAT ytmp = xgrid[1];
   const F_FLOAT ztmp = xgrid[2];
-  //printf("%f %f %f\n", xtmp, ytmp, ztmp);
+  printf("rtmp: %f %f %f\n", xtmp, ytmp, ztmp);
 
   // currently, all grid points are type 1
   // not clear what a better choice would be
@@ -631,10 +631,10 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
 
     //if (rsq >= cutsq_tmp){
-    if (rsq >= rnd_cutsq(itype,jtype)) {
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-10) {
       jtype = -1; // use -1 to signal it's outside the radius
     } else {
-      //printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype));
+      printf("jtype rsq rnd_cutsq: %d %.11f %f\n", jtype, rsq, rnd_cutsq(itype, jtype));
     }
 
     if (j > 340){
@@ -667,6 +667,9 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
         const F_FLOAT dz = x(j,2) - ztmp;
         int jtype = type(j);
         //printf("jtype: %d\n", jtype);
+        if (dx==0 && dy==0 && dz==0){
+          printf("rij: %f %f %f\n", xtmp, ytmp, ztmp);
+        }
         int jelem = 0;
         if (chemflag) jelem = d_map[jtype];
         //d_wjelem[jelem] = 1.0;

From 40db9b1701a130bafd0689bfb5c9a0b3b9ca530b Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sun, 9 Apr 2023 15:54:10 -0600
Subject: [PATCH 14/51] Agreement between Kokkos and original compute sna/grid
 with switchflag = 1; note that switchflag = 0 gives wrongly zeroed values for
 Kokkos because of bug in compute_s_dsfac function of sna_kokkos_impl.h
 causing sfac to be zero

---
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 10 ++++------
 src/KOKKOS/sna_kokkos_impl.h              | 18 ++++++++++++++++--
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 9041509e3f..b37082ca5f 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -578,7 +578,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const F_FLOAT xtmp = xgrid[0];
   const F_FLOAT ytmp = xgrid[1];
   const F_FLOAT ztmp = xgrid[2];
-  printf("rtmp: %f %f %f\n", xtmp, ytmp, ztmp);
+  //printf("rtmp: %f %f %f\n", xtmp, ytmp, ztmp);
 
   // currently, all grid points are type 1
   // not clear what a better choice would be
@@ -631,14 +631,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
 
     //if (rsq >= cutsq_tmp){
+    // don't include atoms that share location with grid point
     if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-10) {
       jtype = -1; // use -1 to signal it's outside the radius
     } else {
-      printf("jtype rsq rnd_cutsq: %d %.11f %f\n", jtype, rsq, rnd_cutsq(itype, jtype));
-    }
-
-    if (j > 340){
-      printf("j: %d\n", j);
+      //printf("jtype rsq rnd_cutsq: %d %.11f %f\n", jtype, rsq, rnd_cutsq(itype, jtype));
     }
 
     //printf("j: %d\n", j);
@@ -830,6 +827,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
     auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
     auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    //printf("^^^ utot: %f %f\n", utot_re, utot_im);
 
     if (mapper.flip_sign == 1){
       utot_im = -utot_im;
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 8102a8b6b7..55256f60cd 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -400,6 +400,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_cayley_klein(const
   const real_type wj_local = wj(iatom, jnbor);
   real_type sfac, dsfac;
   compute_s_dsfac(r, rcut, sinner, dinner, sfac, dsfac);
+  //printf("^^^ sfac wj_local: %f %f\n", sfac, wj_local);
   sfac *= wj_local;
   dsfac *= wj_local;
 
@@ -520,6 +521,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_small(const typ
   const complex b = b_pack(iatom_mod, jnbor, iatom_div);
   const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0);
 
+  //printf("^^^ %f %f %f %f %f\n", a.re, a.im, b.re, b.im, sfac);
+
   const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor);
 
   // we need to "choose" when to bend
@@ -606,6 +609,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
       ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re);
 
     }
+    //printf("^^^ ulist %f %f\n", ulist_accum.re, ulist_accum.im);
 
     ulist_wrapper.set(ma, ulist_accum);
   }
@@ -647,7 +651,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
     }
 
     ulist_wrapper.set(ma, ulist_accum);
-
+    //printf("^^^ ulist_accum: %f %f\n", ulist_accum.re, ulist_accum.im);
     mb++;
   }
 
@@ -656,10 +660,15 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
 
   for (int ma = 0; ma < j; ma++) {
     const complex ulist_prev = ulist_wrapper.get(ma);
+    //printf("ulist_prev %f %f\n", ulist_prev.re, ulist_prev.im);
 
     // atomic add the previous level here
     Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac);
     Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac);
+
+    // see if we can see this value
+    //printf("^^^ %f\n", ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div));
+    //printf("^^^ sfac: %f\n", sfac);
   }
 
 }
@@ -750,6 +759,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
 
             const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
             const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div);
+            //printf("^^^ %f %f %f %f\n", utot.re, zloc.re, utot.im, zloc.im); 
             sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;
 
           }
@@ -875,7 +885,7 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
   int jju1 = idxu_block[j1] + (j1+1)*mb1min;
   int jju2 = idxu_block[j2] + (j2+1)*mb2max;
   int icgb = mb1min*(j2+1) + mb2max;
-
+  //printf("^^^ na nb: %d %d\n", na, nb);
   #ifdef LMP_KK_DEVICE_COMPILE
   #pragma unroll
   #endif
@@ -893,6 +903,7 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
       const complex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div);
       const real_type cgcoeff_a = cgblock[icga];
       const real_type cgcoeff_b = cgblock[icgb];
+      //printf("^^^ utot %f %f %f %f\n", utot1.re, utot1.im, utot2.re, utot2.im);
       ztmp.re += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im);
       ztmp.im += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re);
       ma1++;
@@ -910,6 +921,7 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
     ztmp.re *= scale;
     ztmp.im *= scale;
   }
+  //printf("^^^ ztmp.re ztmp.im: %f %f\n", ztmp.re, ztmp.im);
 
   return ztmp;
 }
@@ -2303,6 +2315,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_s_dsfac(const real
   constexpr real_type zero = static_cast<real_type>(0.0);
   constexpr real_type onehalf = static_cast<real_type>(0.5);
 
+  //printf("^^^ flags: %d %d\n", switch_flag, switch_inner_flag);
+
   if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; }
   else if (switch_flag == 1) {
     if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; }

From 470581d4696912f5a756391195a8257422d7bd0c Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Mon, 10 Apr 2023 17:18:00 -0600
Subject: [PATCH 15/51] Organize sna method calls and clean up

---
 .../.compute_sna_grid_kokkos_impl.h.swo       | Bin 0 -> 49152 bytes
 src/KOKKOS/compute_sna_grid_kokkos.h          |   7 +-
 src/KOKKOS/compute_sna_grid_kokkos_impl.h     | 380 ++----------------
 src/KOKKOS/sna_kokkos_impl.h                  |  21 -
 src/ML-SNAP/compute_grid.cpp                  |   2 -
 src/ML-SNAP/compute_sna_grid.cpp              |  13 -
 6 files changed, 30 insertions(+), 393 deletions(-)
 create mode 100644 src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo

diff --git a/src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo b/src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo
new file mode 100644
index 0000000000000000000000000000000000000000..7f92a608886209c3570518c3c072aa00307d6609
GIT binary patch
literal 49152
zcmeI53w-2NmFI)+0YS&{J?qRR3{Xi|9^Jf}&@}cF+U}%Vk_M$^ib~Z#$v>$_sH$|5
zZd$=7DmpqoP#F{-%&afQk;RWQpA}toc11;86(5MAtEk{x$8mMWb-(A_d;kA`RjQIo
zCkQ*6`t+~!f82ZSz2`p9x#yn$X#YjK2F=Fa{tkYR@96m68&A)F{k>^(+rX1LviVAR
zqB<pS7X4M%_n)!x?9)zLG!QSJ?+Er4vc=+bwXa&q_3hrfd-vYqzFfIDJyQ!Z)lxPy
zUJ3J=sq)lRxta-!(}muNr3kORU*o_U2NvhROl_=Z<MAD*ZCI~e{iMzl&7VAB|Kf4h
z-mY<AjRR{OSmVGN2i7>S#(^~stZ`tC1OK5oP@6us;{}xHVXjo8?(Z=Te;?uQRe0cf
ztl|A5-Ti?3dt1Z%N4fir?(aD+0iVvJ-Se&P@2eZ$KgK=x63cxJ@BhfXKhyoa&=ttj
zKei$MKQz2Q&b{B~{yyR1_QJ<^w;g}!-d7vmyT@yP);O@nfi(`SabS%DYaCePz#0eE
zIIzZnH4dzCV2uM0Ne*Q59UXU(v*(LE%>MtV%jJ(A*U|9~@H}u5`1uJP9q$99;OXGk
z$9Hsm1Wbc6I2nBBu^k;ZgE0_*XM#>}B6tk=8AA3q!Dqk+z~6uugAnWi8^DR+c<@;8
zZwT!_0C$2ffscWYg7<(ofos84;7V{gK$X$a3*L+};9Bq!5P-h`j{@IAiEuA?6}T8Y
z6PyR044wo&j}qbz@L#~q;AP+v@C@)%6d3n`d%<_XZQwBI2j4*X@d|JyI1`)!o&mmr
z(&TRN4loJ2!TsnBJ`8RGhrl=(1M9%=se^BVkAfS4)X5p(7bqdV2)^Kz5sSAZqrZtv
zH7u2aN>B)jsj$nOYux=3uL*u2y!4z72^Nl-RCPA2<tCKWSRp&!by(lX+m<P`M`F4X
zmW0fT2~A{XCv7Qv)=N3<T~vsPOeLF7zIV4Sz*xCrQemlPLZYrehd-OnHQ8FZ*t5A*
zo1YG<JX*J|%SgHV`b@Q!t<=m|rCc;&DId(4^=354WoN2^DUX?n@~m;DECy4SJ~3r6
zVYP#}6Pa32t)==UhZ}nPg~8dGTG>74;cm5f85qb_f-IIW#jsK-S272~U^bQ4xLp#5
zEC`)Ab}>7>RAho^T$f%xY8OjIj6){l$L(sV_;JW&d|N+D#kHZ5@m#$v710GFCQKR(
zio16Y48YPtp`3G8Meq}rq&v4+qod)WE$35Dp8ve+OqbTJ*QBLklt{6zK`cgCDkp7P
zkd~L7X;YHc7?f&NO(L$*R+){fp+%nK0OY)mcq@`|&9opH2WXe1t*{p4?8|mZxhiZy
zYrws3q$Yi1IH;MKY2$cc3SqUTz3&KZ!5I6$F;kh`Os#r(Z!N&QjLv3?<$O3cziH3l
zz6;9L+6di#b4n{mV)@V$wljw5N$JB4Pbbql#wy)RtvD@x)tG2-nIr~-%jCkrWs<=~
z#+E4PlH(A8qhuhf*9X>L%0F%LaA9Qc_PqmU$6T$F&DCrGMk|vq&x{rVQz)0G8MF%J
zS(;**))?e9^@ykrSMVVt13oFFg1LZ~VY!s4PG@t0Irm&6e6n4vQp#I)64G5ogIoN@
zvnyh`3~ZaW2M5Dkkl?oq2B}(pCR@miO=m0FVl~yGLur;f)5+esZhgb8&1F7Y%NDcM
zsf?{^64)`gZ{)d|!7anPCA-mkhKl@xw+k2ma~(^Dg(Ewe4{TYIMH(1Tb}2WQP<HQz
zZfc8M=}>RFyL4QU_txAoE7c<nrQ>i%v%x7rJZ)r5%g79sHV1=Tjk%%_l*Vfln+FD(
z#ZB$mGB~(z*mM_)(`i}*<65>do;JCOY{hhUbKPYIOnuChhPB}&K!>lrO}Rq0S~YR?
zrH$mIlr09yd`T+xdF$w)d2TD0s<q0D<hi3m;@DPYGng3~X0GixDIbi5rN9hsdCtXq
zFWhcUOFr4NYiNgA-@oB>!iA+=VJ07#O(883o)kAHALe)#mc}<H!tp3y2uJH46|?Xy
z5wCM*ng$W{PISf*JDJesL0?~#raW?UN`25$5ROlbQYE%to4m|ag3D)^4w}7^QLFyB
zZL|%~f?~Nc-!z3lmJaoybLC>O{)u!N=E3SzwaN21-w6gxXTyqE=uS6HloXno@Cxyc
zp4Ix9?ml`;i{Fle*^;?92uq_uW!$8OF50zy*Oo2`Pm7}+j53Cm#%0+xQLa>J>=P9@
z9TI7zQm)lZYItZ*SK18McrsWnR|5B>qp#bnP61Y*6T?AZY7>FEaL1PIgFDPv$Oytv
zFOO>F>7GJxFen(;_48px2a5S#;fVw}e`vosKR`;z7R){-hcIXMFd&zzfd<J1m0A{R
z6<TwxYUzemQ!S6xX0w%`*O-xs#62rrlZT_qX(p8!7RVJzYWt3!E!+A}?>YPQEqgYe
z(cj;15Gp3jaJG~Wvs6xHx?IW1{K7CouJ-p1_nPy|2ZKsUmPw2#l8dTP+E$*P*W8%Y
zhW`H3x|+}+SNkPl%Y&d&tjfw<TL8165b^8~-jt^`i5dgPfXUk@L8bakYogv;u8d2J
ziCS&CI?&fQJ3HGOzv-i`*dh#R?qTAdEWMtdRS5&luS=?j7nSj#r<%)BKQaY`xxT@2
zfjOkW@H@z08j_iLvybVnP@u|&!y>O_gp|hD-D7g=*6H7)Z-_GVGH#c~dsu5Q;=z_N
zZ2UVqK7?HPEaXp-|NZ{<?~(aG0{#}H!Trem_ksTgJ_i04+z2iQ7l1zn-$Dm)H~2dE
z8u%)BHJAr!unydZ9^msJ1D*wTf!mSwKMSr0)1U;3U>$e@cq=mf%fZ#)kHBNV*O2Gm
z2i^nT4aULw;BMsjonQwz0X!BQ2W~}%{|Kmp?cmqQ?f(jX1?~bR@JR3oa38Y!E#PMG
zPVjbc19%Cz0PF-kU>)cNPXxa~wtuaY@w@nY0=OHV-UU7lJ_Y^<cptbNl)*({1Pp>b
zK<eNEuoG+rj|O)k|BJ1{5pV?<1zB)5*a)5qo&r|;cZAt6pnHyW$f9{`wi+Orx)DBG
zsbr5V6<}T#(1$w?E)sB&S3yP+@ifXv(SBvpZo1{R0}AnF1+EtxCF&9REy1y7g-J@D
zvZ78b@L0wr`AzKBt1p%=aV>?Vu;zH5<lEv&*+`3~z%bajKD6uDM5Y-Khqmm?*h4)T
z=A=FKPcGOyJd(L+$F@B?hBh}YF3n+c4icKL=f(5eDpblYOx>`%$Xa)?qMj6vvHCPf
zgT5r^@5AmWW^qHsXghDk>t<p>L;S_^l1R0UifxwIAE;`#@3?5!wjD=Nxy|z2ta@#@
zRjJ_BUxro6vbGGW21Q`rjLsV`DY1ANLkU|#YSyO7vS1ZgldDEb*45h7SZmExN@h66
zDke8^5kk61=SZ>>#h{2v`B0ELm@Oc=WTt}(YRPQT%GV<PNvE+x!oYyU+*GwxR7&km
zNAI==W7(NPtzLvC2^X4#M6gkzFph*Ki>D)%Y*?#q@~P}E6|<%6c#z+0HaF(^oDLB`
zo3*660c#O4+Gj{gotX#<lvp*zjSN6u_0_#85`%qYd9XQtfAoU0RbC0lwHnY9MAwLM
zbA5G!w&)Z@dmjbqu6qr)qBjz`OH@8}PayA^JVKKD%E$)uNyd$2$A@e41r%jrX{MO=
zfw$}*+1rI;&E-;vr@CuM2~h~Y#Jw=c&!GFsj;Fe86O!txi+E1Q!B(ScGABumLtQCl
z3Y`dYQ`$_GX-M6YsMYsNwp9^wN|dUo`J<||RTY)9@r50aq<6Hh{P>SnhdaxaKX3&i
zx;$vLcl4rHLrkZ|V9F?jwRtl(Q_6|-yQ)(I%TcfJ&g+qC<jPbSro&u07#kA}c9Ag_
zl@?Mz`%>~IKbN7u6y;)+E@}AXL{(=us{CtWS{kO$NJ#18%>s|c0Z*<A7;CyUD+KFj
zkIi0hG9|t65A4nxSwVI!Fu@`$lO?Ci5svYyK98CoXi~J@yL;md5|Fj29j4kFJsg8P
ztyLI}%FG}l)2F&jihi@r?L6%%qZZg177A!%mEVknr<jlp1$%qA!!4+H4T<J6vvdE@
zwvk<XhgM@m68V1~d3rx`w#fhePLTW>GXEFB9pH1|`JezcgOkCZgFgjN2R}ph{}lKD
zcs|$zL?>_-_!ICr@Iz$(`@uhg4+F6ocp~^6^8b&)kHGyv^aEc9cY!y7?O-eD1}ShV
z_;c`Q;BNB#dhpla1)vJbK+5uP@H@)&W$^dl4PXxt{lSmW6Wk7N0xtxjH|PMrM^A7&
z5c_~Za6GsZUBPvr2nrw#9uIzup5RXKLeK~9MF;SDunYVfvi}#rhr!L@gWw;))4{#S
z^xp;F0Y||3;3OdX{(lc{0`CVCpdZ|UZsv3C?fXm38Ge0zQ4gC;^e}g=J*>@O3>55H
zDGk2`zLJ=2WUOh5^nF__`LkI0?E8lQSC*<HwiFQwX2Zl+I#0E#0WIiq!cHzL@Paz?
z*;_hOzP?wI9yKPL1u_UvyS*)ujH>3cQf`HGCA#~ys1)9|c$4e`$SUmyY|;r97H}1*
zts*ILqGD!NkKpQ2vRj~HH^Z;3`f95hR03~Bvek3eA5qg-Bsc9u_sbXGi>%?0l+a8|
z{ACn%d7))h_aG0_TELZ5<w{EI8N9soCB<b_Nj;MjgTPA2EF%X?B(sv&OT--;Gx6HI
zS>u-Bp5OA=A2-+pFBk16?IqD>PHeC?12<iow8&0_+G$|G>u$YBX4Pz7OEZD$UyW^K
zv|!I<rCN`rRKD#iEU1=U>?c^=peU4bY`|iT5Om3|ESkcVnyNg%nSySyy^dw&eLEHw
z-q1t^VkZ^4EfV9T#PLdbHd6^FQ~l|FC-_#1VW~fDjbFtKDm{8AsFd|eWMuoO#6GHC
z%~pfL7y`E5L>721dTcu@^{qKTrd*~lnyQRtbN%VsY<UuSJ1r}HXU;V`k69rEy{)5D
zpUza#i(=^**-;Y1E9vq<FrAT_be5%QYn-Y@w<@Y;A;o5L0%_kGlWJpdoKOR{))8a#
zMB>Lcj?>MOimeZc5#MNxRz}Iq=s251Zy~Jln}zcRhmox?A>o%TlKo#ScqzLzrRE~C
zFOGz*B@OqiO4c`@uET~Tw1XuU%hfU_ROm}f<5d(MQ-QgRv@SE_*fpvND^^K)y54xU
zk}m|9U}4%I1dX)3-e!4@`E9cv-)jj({=Wrz_8*aTMgI5a_htY8ci`W_FTpRs2f#Q8
zKps2}d<fb87VtVy0cCIrcrJJtcsnxw6c_>`=f4xY4O|0W2`&UDfuA6+{|NjLd<)zI
z#2!G-1w0F!1;ifURB#G-GI$dBJ@Wj0;ETYY3HVE{H-g84dy(tE3*;<78I-_Q@E9Ov
z{%7z>a2>cDYykt{Y_Jjh2HyMxd<eV_><3Q+at7cg@P6<<a6NbhcsaNlOoC^E@1qa+
zI5+?<0nY``0fXRi-~{kn^a9@jSA+AwW-tJr3?2!N0e7GyxE{O&TnbJD8$d5efn&kX
z(HDFIh`m4w90%+#8XRP|qs|bzgkepX57M3cvb07vS(=(lCqji4+2VDb$UW~&*UB}l
z5i1y%1k!aNwZP!BI$z4ILJ|u}EPOJh!Ucm{lYF-f-SUN!v2W#nDO{U;YByx7aIKy<
zFPX@CM;Bv3tOXC8#a}XE39}4dEY)V(X6jOe7Bs#Vg=u`e3gz)6`K$fzfh&_3txJbO
zlkF32a>^lY{%r+f3BT;V-9F*P+lugO0lSh<xNa6?v92y1VgMdAq3oh16KEJ1TSsa-
ztoXEKFzsA~Y+NNRx!p#}N1H$*UgC0*IiY!rATD)bDJ+*x`{5T4xuCHuYzfUo3uzBp
zM-5Y!uEe&;+$dd34kzfD2=<6ECYDu}H|ee#HgSv|?3ymKQ9~|EI{UE%XK#o3lY?VI
zreM~^55`5r^T#t{q7tDhrnW2t_j!eB-_(><ma141PFiDGT^O?TH*svuQq^GA%tnzK
zUyF2M51pyD)#ghos_AT;*amaQUBp-+Q4V&_dC=SZsK#7fz_}KoJWFJ-YCK1hevTf$
zxMHXMGxFShJM_vfz7tCdQc9H$O@uwALxnOYv$|8IdAXbScXM(#r*|S_&2bWfIHmZp
zJjGZ+pT<$;=^^(tCa=}x<y^>c9D{IH3h+X&UCOVv-0o`8+di(w{vHj>bptEIM|;5(
zY|8Ho#|NRBSGA24#b__M?E^j)y<0pYOje^$(vj}Pcr5kfr7yGiPZ|WrnP>y@zg%|;
zkpcNn)9zH^&loEnOE0wtEl<y9u!F7!HNV8MT+ujb&4{~HD<I7vqI022GBif{UlXP&
zrVZIOag>bCC@QdNBeHEqvM8s(B$MWpQ;alXy{7<r4^_Q~u2mj4=D0qiBx19v7l}-w
z(;$<H2R7F+TZmwpvUmb%VPPGiB`FJ8ctP9qEU+~z%TxZj26?{J&wy}VQphNq?Ww!w
z0mcjE(QH9NSlYWVP2jK=o93j_f~bvSu+lg632{0?p6DTg5>`NYjy?F1csqyU%8aSa
z$j(fsI3h9|pnWNwS~IiRQYjmkQcf>OVO&GzmQxZO6koYYkGx9Kt287G3pE)K)c#-W
z*|#FAiu~{W^NIZb9w7Vv&jA;Mv%y(F_WR!sZUC<auK=>we<rx+c+TE~ZQxPhVc-}b
zd;8x5HvqBu{}QtME5J*^OTb|;3(f#PMQ(p7*au|KemD3v^7=2q`#~0@z-N%pKMmdm
z-U_Bc89W_)1sVMca2WgrxQFun71#@;tldEN?7s`X1O6BIHn<TCgWHg~KLOqi{xcW?
zPXu=&XI~5C+<qS9zz>kKzYksv4uDI*bHR6y!L|a-fdxOq?+6D~*#J9CdzRmB6<Y@e
z<jkWS0?CN6lRx>mKO8Rdl;gwGYKxbdVT@DDIybPH_yr?q6sWgqi9e8uk-~za*i;uh
z)^&122_<M-0o}OZS+R_Gnw#aOu?+gU`ttc|s3J{!nY4~7zftp8W@14@#*aSw(Z`J1
zYl}v`2t#b54_wkSqS%e{*DLdgH|~|$l}&@F=k5zCBf6oz+@=;)9agqYdNzeS8j8v^
z_usPPCnGO)y=>w~HIOaE_-rwitQavX_8E_=FI7@&;41qN2iZ!B?38ovY?F+tbw<Py
zkzs6Ewv>m<TSl7NL${!nF!QS)y*RUDGZ&fsnMBrTaXzPrk<GGcdJNu9#2bh=wT_G6
z%e{a!WOEZUr73F+4GT=SmtMOu7Pp&ABxs#qNd8#ESt&9WJ0-=cx(x{O17uT$SaNdO
zhAzmZtTe2boueEf>%`#zLbcepcKI?x^&(DL^2`die#Q%{MI+}v&16QRDu?XsXt$eu
z*4c~pQ8KlON>H6}q~j7NTLQ&S-a1?UO~SB3Ft<6jl4oUa4>8l{yA9^Ke36JQ?%F_1
zJbDA=y4E5LSja|->(bZt_yDX@)j%~79I=)aCwn=0W6u{LT9l(bzL_yjktjQ)LYh>$
zp1sv0^EjL-nag}ZPzWSHsMI@g8B~O&(Aj1SyFWCueXbZ0Td4`w{P+eSKUO|SbGBE`
z4{OfD+?Ko~#4375?1yP1QnyxEYrc-zS7Dy%OuL>qT?y!FP1i5_Ds`B>*mVDq^x-=G
z`VCOLJk&Sjg#mr#WuPZR>iQ$D8l@wjw*5lctJt^TSVLUtKAuT*VJ*Yvs2&=UIEp$@
zf=hAE)zh{t8jShf1rE5!t3Y4Rxv3TtXB;46scGP;8v?`^gzsQ7JRV6e2pF~dsG~@_
z%jRsHr*x9L3(MJr)p6XTVE4il+T{I^ICO{_uPWZ^8n0{B`hqbbdAsz2q2;`lB$J9O
zr`fz^vs6DACEEM)CN4)T(eNGPhJ+29jW7*u2IqB>!)l9BmSD|BSj#Fu{b*M0u#`+n
zQ;LV(m<7^4NiV?^X05uB!q1O&4`pIx;G@^|xOQZ6L0CwV@gt%0rqX3y<6!Z^`Mn@p
zl0yhp^cYGg&JTG>=qpu2!N|)2gvhic4$Hu<VYSThV@M3{8nRiDNrzgl+f>z@cWQWf
z-N`gj3DskaxLvSE>X=y0I&-5;0Iqn^2laY$)3|+cdJs9Fh=RBRdu9J$_K+_`wiWq*
zD}UtodE|Vt{r@W<J^+sa9pK%__<8U|@FnE>&w)>a+rXQ_8^8z<x&I}|`C_}j8Jr6q
z4ju;n1sPw?{9gqg4`lED3*eLBdhl%UE#&)mf_H!l=mK9xt`}K<1l)s6|5k845Zn9^
zJQjQtxn9ome+|gK{FUGZ;9?-Y`ovaW&iLPjjQ>^e74Rys1&H1LP2m0DJwSN!Y#_4#
z6TlyX$Aia#6M&rOzZJY36u=ai1f$?&@I`b0*8n-Ee-;pZz_-9%;CAp?a1HQ3ar!W=
zt{+&I=q&FB5_>`jM|IVJ!GB!c%yIB>-W0O)ETgFXU|MuimVk>IQ>%My$CP#{=5U2F
zUbc%gQEvHFSA9~eLo-~Ze6A9iL~&O{Q6ih+B}pt9pjov?kyfidWL(!)cvS6AZTnXe
zzeQ_fG1|+q>D52B&Ay(_jOzYNdK1UEt9F#MhmqadS0@1$zca`z2k8an!h8uIF4=-;
zw9VFFT%6?iBjB|Oe5<f<#{W%-k0;SIOIyN+0Hbm~n8w#sFOG8pG=H@$7GBvrev4{Y
z>#gR=a9N2r+22u{frW1FnuK&N@#0gix<wdKjkj_ZB^d`>f}U#s<Isx|Z3x?v@*>iR
zTUp^QiQSW=6e1g*>c-3qF)ieT#M)?&Kfbrwuls~7YN2>;Pb_bY9Q4x$k=Q6ucSxgQ
zbsG7qGE;OCpe(a|HvGVAb;2BjFICYF7q`X|70&+bv&?7<!73Rd5(XM%0e@gJk$Ec%
zG=z&Kfh1KgWVI_|)x}MU0wcaKTSJin`uc|Uj_epP?94FDsmDS&Q=VApL_6Q?o}Z^N
z>id>#;cHiC^JXH<O(4bM1Cg_>v*npW-jqu?Um6YgS3-!+(X)A^wN%7$6D9*n^5SIL
z-tFsUdVp3<G@bhg1_QT<-C3^S_`xQaF9%h&jFeDRyyU<mSyWnvktE7*>qg+xNHhVz
z=xfyg;`ggp{V~nuR(>L8vg)^4t%R-kc<t{SrbrnLM|-7xM6s6S*Clb6@i{ouqQk`R
z7va|-IRM>$cZq2pEx*WZ6MkKy9i@(6nzJOo^nsOu9KS9Jg<lc-Okxb86)%_@kx6{V
zG+Q<5B4cc4)XEcjpD4blWRe4~;pax>WSCZrkJ`Y@^4OoG?JIyKPn#;Cy4cA|81Zb`
zuhpQkTCAo^Mr+hsEOe@P88u=>;RMwIRues)W90(+RIhPM8#$AJT?-00sYY8D_WxEh
zhpYGfp*p0C^+=-kuCESP8Oz=MXsOJIs&^*zs_h3;39sdUh$-QvYsyb_ydwK2Y`xl3
zx=SmN9|R)(dY{ZDe)%8@_Bsu_$p5z^o1Tk2EAqepzMk0q{}Xs8xEopj>)@4O16U7!
zihTbQAm0FZ4akD^pdUOHd=WWcz6J0>@G7ta91p&SyniE*{r(fddy)0UpZ}}CH9-9M
zi~N5sI2Ak%90z`f{QvLZKJWu@Gk6iW2<!(>0lz~pApZU34F4_Q^`Hta1kVPi0I}!)
z-{9Lo{QbWNybxRky1`G;4SWOK0ImaX0b<K9`h#P@N6`~p2i^i+0Or8OU_Cev+zTJS
z3H~d%1-uFbK=}JK@Ko>=Ah!J<L`U#4@HgNE;2bajo&i2Xoygz+1pb-pPl6l3)!=e)
zD!477JFqoxhkti42&eal82u07jT&Xx+=0nU4{UIb{Tsv06K*~cPFzV#JBq^9-H4as
zqbA;pQZltf0<D54OC@2*YCA!f%YXz65}DG(U8>?AWd#g(RF;$X2P$k-*`o!`)gong
z&1pF~Nl@3;vt*NV{1!9QxkS_KcTE#@fF#dIC(gIMzB4HXm;nvpJO<nJcV>3(*}G-L
z<fUiKr6$wtS@vibNjCT>lw?vuu{YPF9i(X}%_5=rwgZfboWGA`b`6c-@?G4btF{~0
zu+<dq<0qYBzM%+_#{@9|<{Xw3@4zXS%9%@PWTI}LREJfkP}-SN%~}U85D=Y<EB}0i
z?a^%hB6a+Gq3n6-ZfDx5pwpxR3o&EE6s~Mt!|MjDec#*LYo<iG$oVt%om>sZjXG~b
zsKJ^xo5O03^LgXy{yU@lmy9zafJt_ptu-9b?_H3PTYgxxyM?yZY&qf{2Or|q%XL9x
zAjG->7v8n8R43w>s8#U(lS2)Uui}%(?7!#9hL|uut(C;Oh-+oCY#T@6^_U`=K=)#%
zXK*@<21YFOWJxAU8o!29ZBG=-jF#sj(?Bl)W)Ve;I3@O6#wl<<yJ0SxXO)CH+Lr{<
zYNK5UYNsNyj&o`>(D$YCY_Alvu4`}<QS43f{;MSiLiw11HhQ}UZV#V!8GWuH^)#X=
zE^H_m@82og<K~02DYfsm7R_)%%^ca*s8F4vv_k%^)z}xvO_Z^Rbh7cN95Ep<+Aka#
z)(Z3NBcQ@i^@W<iqcM|e5!&nH^7u}qgC*Ta!n+BmzAx&;;{({4r#^soe;>g1e@~6_
zhlkijm+46QrhUHtQh7dapU=n7`}O&peXdTc`qWx}!(1wq?9qtikG32NEZ5X2nZYDM
z{9nrhX({1cI-F0Fu%6JI4NgTTh&`yN4^YYk!ULlFW*de!=72dl&p&OfxZAQ1eOl68
zEPYp^R7vt#n(ty`ON+uiS1#9dFZuWkq_YL--}BfGOmiNalfbICu86CtL={Pv+$?2T
zbQ*1iXCXwz7P(LsE|HGhVUetNhx876WWAFN%19S(Fej^pee_gD$#6rTSlvg@HpsJ(
z@h}x0>XtX%Vt#M?q)?LaHw^ny$yb(bj@`m%<+roV)!de9SY^=8>_`I%ac9%rC5vNp
zhhhjLZYon<y^L4xI`K}7wZtwZ(p_;aTWRGnA=8yM4B+eHEzOB=j3XPqdrAdGyPfii
z)HO*)jj1HjbENLANx2MyWu#jEP#ZK2L;t~uLL=5ujwxBHwlqT8T149pA$f>1Pfe+m
zpb5t2IsW@EdQL@*V_4#Ad)zf<LK%VR^fW%X61U^yB`iG1WzTwd4n>18BqHm_oMwbf
z29`D^W<MIl#Zy=Cgkxh-3F>4Z(*%JzK~vGe1_i&yBb?|w(J=@qKH~wy934IC95IcY
zdD9tdhNA6&QJu!MIcd!lJ0!4k5APSUlIqp=z>qgS6Ucd0_$!qcHy~{!Qti}Y;J%=u
z1G)A%Nu3v3UD9-Li5zp@GCTH@FQIJsM0&)~7K5@+-qS>20qn3r667^hBCytE6g;Lq
znyoSwUOvMV9@cO>FBMfrgDD5aRj;h%CA_XFmg#q3ffGEN!$J8h0N%m9Z(9nW+a73V
ztzo=4_K~Xf3m@zKR-1wQHb%66UDZiA*^0bFM7j(ovS#_>Kv)dLQT}wLoWp}BJt$lC
zYYDYU5Ot{>po-5gGPADOk46iBeQpt>38!t(l&(^@Z7>4A_bsq4O6(dWiQuS|#?3)9
zJ#Tf2HjF=j^FMJzDf0iDk$c7Goyh-Zy3Od{Aoq*^zxRWeftP|J5dVMYg8zdK;J<^b
z!ByZ&5CE|a*ax<PEx_vwp2W4-415gTz?EPx7y?}&1)czYkAC1z@Lq5g$b&&}BKVHe
z8;tW;&i;QKoxmaR3?OF%z6!1caxUP{fSm9DI%WDCxDi|rq>R^qR{}Zb@4x@Ij_XH)
zH^GNjgGsOv^nh<8^WOsIfB_#u#vcN5W?#+-ydB&C{up@MfZLJTKMSq_=Ycc8>EHzL
z3*>e=r~g_YXY|ux9q0z%LY{vccxx=z8~xeYWZgO$mgMJEeD-d!w`hCh*BGSf!z5Mb
z8`RAu;vKZCahlLtBATA9>S8R`wdj%wyHpx8y!=G;Xhnx3#~(@*-z?-vGJ&NMcXT9}
zy<93nDce@KwgRE}?oCz;j+yjF9_+>9Ws@+AW!KSIEJ%~K(WH`_@L#iT6KI=$+xb~0
z-CAnd5u_8eQP9QtyE5k4?*^vA2P*FibN4`{v@*IHyS+BFvV`J6OWJdSR^qbOd|+<m
zWjitAF!5E;CjItWXJf83R(pxXp&GANr^#EYt3!)pbse6bj{OUVP0sPiw4|So^%E<i
zgZMb}Zml8`$=Shi(E3rrXITBP(0fKVPEwLimQ;O%;8FKfQ@vEhVV_M$*Lu5ZK{s}V
z<<vq-H}P}<T`5oFCw!v=hY@G2PlWoSOvX%cS*yrLAR7wI_i$@r4tbo9*NAf=UBPZn
zrTD}xa#gIwavAFu8hA2Gl-v=YZI7^bXm(&hNfFEVz84>j#F7J!<gEN52imQGVJ5Pu
zY3$hw`x#%+TcY*i{FYVCiOiy8qk4yCq?NTk8T7phELN$9z@yPb06=@m*VuCI2nj7t
zPhN=SZA8#>GRa2KL1Wa=oRRrM3t1(e>Y`RBy6M)j=a-6&%A<AcxkT)`j$zqEO#PDX
zfk`R7)5}Y97CnV!RC3v8A4^ao%Ab^1gi9PNM|q`TEB$<vkS%odO+vQN)7wtZ*Cw+C
z@2yn|tA!FB11vf$a=*``mM&3@NbRi}wU#=2J6+Lg%H;yz>5EeHDrCP!%i+RoDdnHG
zrM|b{d)<<soME3p)J%Lkt`_#VWg_P~yyjM{rx^F*%4u*VC#C9@vPM2`XlT+xoF@Mq
zBQH4^;_hi5-1E~72|o`uaL)!2(h;YisO)voj~GkEKICN&-^!)$Nzdw8FWM>5Q?Vbe
zS}U>9#LJGV7ere=qehrIa<gwBH5Pr6Hl-Txw5HW%uDC)r7C1Y$KIqw?wJKRB9jmhN
zUlO(F*;JjO2cV+!YgV12N=fVbMRQ%>a3uk{#awq@)V2BfZd9}UJ=C37{j}>FiPoc?
zpb4Q?J)k~0sIOp$%E|v%AaDK%`7^oy|664Ie*xn6@8{rW;3jY_knaHg9guGTmcS4g
z1S#+s@Mv&5^89CjoDH}dTn@x0;I-g2;5lF;I17kB|GSXsZv&qK*Mc_y@$G*hI0sB4
z=l6mh@Iz$#Yrz|V`1E@;_$TD|*8wlr%Rc|tz}vtPAie;^ci)-d4DfSg`5%J&!4JUq
z!HwY6;1kIASAi?RW^g~U{e9r$pa5hqe-ro>^1S#1xDL#NIq+;Ce)-meU%}I#f*%9n
z@4tf2gPXwzf%xfrH`ouJ1%69Cd<A>~{7>+1kod_W#Ghev^N62oX2J~WG2Pp<ZeDQh
zuvFkSg1Egxbcyn**fRSgld=P;w;aIny<hwLWP)DeGFoCZ$!eBQjkHBt`n1qgBsj9%
z%cYsJtP)GM{6HAF!!`VDAC-9`-T|!}QQf#Fj9W~Pous7t?1){ADRl>%SZhVqV%KdA
zJI7JX7Q4pG8cWSN^(l6xWn9;}P`0@7Kg!M_dkGl~HQDxwazoDCHiZiwziEuceq_9%
zQQ>=E2T@^nDl<D}5|}Mlrr2s^?>leK-?q)1#kQ(^o|O^YYw9C)vuf)TZPjPCKA+3m
zA+V>W<AyAZPNW@?)-4V_p+<?kw%ctJ^ofaaKAKV_TPxMbKfr2kXDYZ-MYkjKAgQ_;
zIpB2b(X%SPRjg%%Yr1y0lge6j)LR<^;xS-yGBc^IyT{nSQP-DlvWyNLkDrzft3^*b
z8Y8IAL@@ulaY(5jh@5W%hNQYf`YPqfNVeF})H3ce%a#~0qOswjJy@w{F7e;+|HNRW
zUXoTmSSgtYZ?KA%P1boqDw@6-jF{C#vP(>Eb|I!MSl%!ldwVOUbrd1i9Ecf|6E12p
zo(g;Udb>&?s1L+c<T5|4CDj&}(aDr*QeHtI)aA=ki<h`}E?Bz6{j9rwQK5t%s`H`x
zYDj#dEjdxdg{6Apo2a0FfZbeObT`NaIdvJ4nG&B7i8eG@!W7Z1y`y!!uHxlIizzKG
z)nbd<|L<DGHmmMSIOlMKPduUMQX*!b<^F@!O0=2yh0qb*|34F1^{<g<MgHICWU5<{
z^ZyRWx&MzM&)*7O1TF&^a5mTojswSnFC*XI3El|A2fzTh2f6;6;8yT1a0M6z&jSa*
zqrs!Vy~y_>@9zNT0g?C51{=Y@BJbY=Hi6$F=l=%W3a$fh0X0wsJHR#|zWwD}e)pjZ
z*a;p19u7W*%zq<z7kEApoxol&2=;(u!7q{h2f=aRSa1ij|NFpu!HdC*fcN8nBG(3d
z2EKh7ycxU!ydDTIUkhFXc7tbu4PZSu27Cd2-w0};3Pe}X4@5U`2RH%z8rlDrdig&g
z?Ydz(WTY3vo}{%HlA%6Im>e}RacV)J#bdRSYKv~{ozqKXe^}co^MIFN<q0f5kIPDP
zS%Ih62G@bA{if6^!t#sN%<fLqSFAcjQxmO$+!ty^Rm7qP!))ARa3No*r2~xWad4+m
zv7>CX5#?SI%1w+)Dl?tUO(mv7C6%OHuiz4ex)_N1q;+`}-SAZ`o1-e<thHCsM{DCT
z--n5F;-AE8Das>Lugj@F?;6`R!V;ZbC3g}qalAaCeTvi6#RmySYe$BLXhDJuRHs0C
zNUyx2bC6rZr6yNl+e2vuHX0@x+T`SD(^c2xTE!!S(5@T5jO-RyOIN#a_Q6_F$yn8}
z?mkps>xY`x`l0&nuBWrtx_g*+wSLKxoqclh(1Tj*c9ydLM{PilW9t-?35$u&Z#bqy
zE0<4_&txXDg)wHFMmb)lmR@gn;o4M!UL`nsdNOsm9E=shX>{4+d<`MFyY15Rr^}no
zk~7WoeAr;XU(3irC%wSo(GfQxd*-+R6tKr#+G0A-v8{5~v%c#v-AjI+vZoLhErnK5
zC1!_2%n~yZyRn#IaY0Gjs41IvOd6B)XzGqQh~vq|rAAw%HQRO~-#Jq#TtH(P)W+l5
zP@I%DrZK{3i#i~;_FzxEXo)mJP7SmPOSYhh14oV!vSg7P7A(puS>(hoNyS@2ne*7l
sARxzAsvKaj=f`w0LV{+qAO18qc*9s5#KL^EX4${wFeYq!u*R|f3;$<ZrvLx|

literal 0
HcmV?d00001

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index 571e09742e..830601c0fb 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -72,7 +72,7 @@ struct TagCSNAGridComputeZi{};
 struct TagCSNAGridComputeBi{};
 struct TagCSNAGridTransformBi{}; // re-order blist from AoSoA to AoS
 struct TagCSNAGridLocalFill{}; // fill the gridlocal array
-struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
+//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
 
 struct TagComputeSNAGridLoop{};
 struct TagComputeSNAGrid3D{};
@@ -214,10 +214,7 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   void operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridLocalFill,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalFill>::member_type& team) const;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridLocalFill2,const int& ii) const;
+  void operator() (TagCSNAGridLocalFill,const int& ii) const;
 
  protected:
 
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index b37082ca5f..ec55b5fae4 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -47,8 +47,6 @@ namespace LAMMPS_NS {
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
 {
-  //respa_enable = 0;
-  printf("^^^ Begin ComputeSNAGridKokkos constructor\n");
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
@@ -61,18 +59,7 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
 
   host_flag = (execution_space == Host);
 
-  // ComputeSNAGrid constructor allocates `map` so let's do same here.
-  // actually, let's move this down to init
-  //int n = atom->ntypes;
-  //printf("^^^ realloc d_map\n");
-  //MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
- 
-
-  printf("^^^ wjelem[0]: %f\n", wjelem[0]);
-  printf("^^^ wjelem[1]: %f\n", wjelem[1]);
-  
-
-  printf("^^^^^ cutsq: %f\n", cutsq[1][1]);
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
 
   cutsq_tmp = cutsq[1][1];
 
@@ -83,31 +70,19 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
     }
   }
 
-
-  //memoryKK->create_kokkos(k_gridlocal,
-  //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]);
-
-
    // Set up element lists
-  printf("^^^ Begin kokkos reallocs with nelements = %d\n", nelements);
   MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
   MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
-  // pair snap kokkos uses `ncoeffall` in the following, inherits from original.
-  //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff);
   MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
   MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
   // test
   MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
 
   int n = atom->ntypes;
-  //printf("^^^ realloc d_map\n");
-  printf("^^^ n: %d\n", n);
   MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
 
-  printf("^^^ begin mirrow view creation\n");
   auto h_radelem = Kokkos::create_mirror_view(d_radelem);
   auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
-  //auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem);
   auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
   auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
   auto h_map = Kokkos::create_mirror_view(d_map);
@@ -115,31 +90,20 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
   auto h_test = Kokkos::create_mirror_view(d_test);
   h_test(0) = 2.0;
 
-  printf("^^^ begin loop over elements, nelements = %d\n", nelements);
   // start from index 1 because of how compute sna/grid is
   for (int i = 1; i <= atom->ntypes; i++) {
-    printf("^^^^^ i %d\n", i);
     h_radelem(i-1) = radelem[i];
     h_wjelem(i-1) = wjelem[i];
-    printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]);
-    printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i));
     if (switchinnerflag){
       h_sinnerelem(i) = sinnerelem[i];
       h_dinnerelem(i) = dinnerelem[i];
     }
-    // pair snap kokkos uses `ncoeffall` in the following.
-    //for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) {
-    //  h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff];
-    //}
   }
 
-  printf("^^^ begin loop over map\n");
-  // NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where 
-  // some things like `map` get allocated regardless of chem flag.
+  // In pair snap some things like `map` get allocated regardless of chem flag.
   if (chemflag){ 
     for (int i = 1; i <= atom->ntypes; i++) {
       h_map(i) = map[i];
-      printf("%d\n", map[i]);
     }
   }
 
@@ -152,11 +116,9 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
   if (chemflag){
     Kokkos::deep_copy(d_map,h_map);
   }
-  // test
   Kokkos::deep_copy(d_test,h_test);
 
   double bytes =  MemKK::memory_usage(d_wjelem);
-  printf("^^^ bytes: %f\n", bytes);
 
   snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
     rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
@@ -171,10 +133,6 @@ template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
 {
   if (copymode) return;
-
-  //memoryKK->destroy_kokkos(k_eatom,eatom);
-  //memoryKK->destroy_kokkos(k_vatom,vatom);
-  printf("^^^ Finish ComputeSNAGridKokkos destructor\n");
 }
 
 // Init
@@ -182,90 +140,10 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokko
 template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
 {
-  printf("^^^ Begin ComputeSNAGridKokkos init()\n");
-  // The part of pair_snap_kokkos_impl.h that allocates snap params is coeff(), and it 
-  // calls the original coeff function. So let's do that here: 
-
-  ComputeSNAGrid::init();
-
-  /*
-  // Set up element lists
-  printf("^^^ Begin kokkos reallocs\n");
-  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
-  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
-  // pair snap kokkos uses `ncoeffall` in the following, inherits from original.
-  //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff);
-  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
-  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
-  int n = atom->ntypes;
-  //printf("^^^ realloc d_map\n");
-  printf("^^^ n: %d\n", n);
-  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
-
-  printf("^^^ begin mirrow view creation\n");
-  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
-  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
-  //auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem);
-  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
-  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
-  auto h_map = Kokkos::create_mirror_view(d_map);
-
-  printf("^^^ begin loop over elements, nelements = %d\n", nelements);
-  // start from index 1 because of how compute sna/grid is
-  for (int i = 1; i <= atom->ntypes; i++) {
-    printf("^^^^^ i %d\n", i);
-    h_radelem(i) = radelem[i];
-    h_wjelem(i) = wjelem[i];
-    printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]);
-    printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i));
-    if (switchinnerflag){
-      h_sinnerelem(i) = sinnerelem[i];
-      h_dinnerelem(i) = dinnerelem[i];
-    }
-    // pair snap kokkos uses `ncoeffall` in the following.
-    //for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) {
-    //  h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff];
-    //}
-  }
-
-  printf("^^^ begin loop over map\n");
-  // NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where 
-  // some things like `map` get allocated regardless of chem flag.
-  if (chemflag){ 
-    for (int i = 1; i <= atom->ntypes; i++) {
-      h_map(i) = map[i];
-      printf("%d\n", map[i]);
-    }
-  }
-
-  Kokkos::deep_copy(d_radelem,h_radelem);
-  Kokkos::deep_copy(d_wjelem,h_wjelem);
-  if (switchinnerflag){
-    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
-    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
-  }
-  if (chemflag){
-    Kokkos::deep_copy(d_map,h_map);
-  }
-
-  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
-    rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
-  snaKK.grow_rij(0,0);
-  snaKK.init();
-  */
-
   if (host_flag) {
-
-    // The following lmp->kokkos will compile error with pointer to incomplete class type not allowed.
-    //if (lmp->kokkos->nthreads > 1)
-    //  error->all(FLERR,"Compute style sna/grid/kk can currently only run on a single "
-    //                     "CPU thread");
-
-    //ComputeSNAGrid::init();
     return;
   }
-
-  printf("^^^ Finished ComputeSNAGridKokkos init\n");
+  ComputeSNAGrid::init();
 
 }
 
@@ -274,11 +152,10 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
 template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
 {
+
   // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
   // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
-  //ComputeGrid::setup();
-  printf("^^^^^ SETUP!\n");
-  //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]);
+
   ComputeGrid::set_grid_global();
   ComputeGrid::set_grid_local();
   
@@ -303,20 +180,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
 template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
 {
-  printf("^^^ Begin ComputeSNAGridKokkos compute_array()\n");
-
-  if (DeviceType::in_parallel()) {
-    printf("^^^ compute_array() is a host function\n");
-  } else {
-    printf("^^^ compute_array() is not a host function\n");
-  }
-
   if (host_flag) {
-    /*
-    atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK);
-    PairSNAP::compute(eflag_in,vflag_in);
-    atomKK->modified(Host,F_MASK);
-    */
     return;
   }
 
@@ -325,53 +189,26 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   zlen = nzhi-nzlo+1;
   ylen = nyhi-nylo+1;
   xlen = nxhi-nxlo+1;
-  printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi);
   total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
 
   atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
   x = atomKK->k_x.view<DeviceType>();
-  // This will error because trying to access host view on the device:
-  //printf("x(0,0): %f\n", x(0,0));
   type = atomKK->k_type.view<DeviceType>();
   k_cutsq.template sync<DeviceType>();
 
-
-  MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",total_range);
-
-  //printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi);
-  
   // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total 
   // number of atoms.
-  
-  //const int ntotal = atomKK->nlocal + atomKK->nghost;
+
   ntotal = atomKK->nlocal + atomKK->nghost;
-  //printf("^^^ ntotal:  %d\n", ntotal);
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
 
-  // ensure rij, inside, and typej are of size jnum
-  // snaKK.grow_rij(int, int) requires 2 args where one is a chunksize.
-
-  chunk_size = MIN(chunksize, total_range); // "chunksize" variable is set by user
-  //printf("^^^ chunk_size: %d\n", chunk_size);
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
+  chunk_size = MIN(chunksize, total_range);
   snaKK.grow_rij(chunk_size, ntotal);
 
-  // Launch 3 teams of the maximum number of threads per team
-  //const int team_size_max = team_policy(3, 1).team_size_max(
-  //    TagCSNAGridTeamPolicy, Kokkos::ParallelForTag());
-  //typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeamPolicy> team_policy_test(3,1);
-
-  // Using custom policy:
-  /* 
-  CSNAGridTeamPolicy<DeviceType, team_size_compute_neigh ,TagCSNAGridTeam> team_policy(chunk_size,team_size_compute_neigh,vector_length);
-  //team_policy = team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-  Kokkos::parallel_for("TeamPolicy",team_policy,*this);
-  */
-
-
-  chunk_size = total_range; 
-  printf("%d %d %d\n", chunk_size, team_size_compute_neigh, vector_length);
-  // team_size_compute_neigh is defined in `pair_snap_kokkos.h`
-
-
+  //chunk_size = total_range;
+ 
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
@@ -443,7 +280,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   }
 
   //Compute bispectrum in AoSoA data layout, transform Bi
-  //if (quadraticflag || eflag) {
 
   //ComputeZi
   const int idxz_max = snaKK.idxz_max;
@@ -465,33 +301,12 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
       policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
   Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
 
-  //Looks like best way to grab blist is in a parallel_for
-
-  //GridFill
-  /* 
+  // Fill the grid array with bispectrum values
   {
-    int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * ntotal);
-
-    SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridLocalFill> 
-      policy_fill(chunk_size, team_size_compute_neigh, vector_length);
-    policy_fill = policy_fill.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-    Kokkos::parallel_for("GridLocalFill",policy_fill,*this);
-  }
-  */
-
-  //GridFill2
-  {
-    typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill2> policy_fill(0,chunk_size);
+    typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
     Kokkos::parallel_for(policy_fill, *this);
   }
 
-
-  // populate the gridlocal array
-  // best to do parallel loop over grid points again
-  // ...
-
-  // d_grid(0,0) = 1.0; // attempt to access inaccessible memory space
-
   k_gridlocal.template modify<DeviceType>();
   k_gridlocal.template sync<LMPHostType>();
 
@@ -500,9 +315,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
 
   k_gridall.template modify<DeviceType>();
   k_gridall.template sync<LMPHostType>();
-  
-
-  printf("^^^ End ComputeSNAGridKokkos compute_array()\n");
 }
 
 /* ----------------------------------------------------------------------
@@ -517,16 +329,8 @@ template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
 
-  // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos
-  //printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0));
-  //artificially set values here since we can't get the deep_copy to work
-  //d_wjelem[1] = 1.0;
-  //d_radelem[1] = 0.5;
-  //printf("%f\n", rnd_cutsq(1,1));
-
-  //Print the test view to see that the deep copy works:
-  //printf("%f\n", d_test(0));
-
+  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
+  // Main difference is that we don't use the neighbor class or neighbor variables here.
 
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
@@ -534,14 +338,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   // team_rank : rank of thread in this team
   // league_rank : rank of team in this league
   // team_size : number of threads in this team
-  //printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size());
 
   // extract loop index
   int ii = team.team_rank() + team.league_rank() * team.team_size();
   if (ii >= chunk_size) return;
 
-  //d_gridall(ii,0) = 100.0;
-
   // get a pointer to scratch memory
   // This is used to cache whether or not an atom is within the cutoff.
   // If it is, type_cache is assigned to the atom type.
@@ -549,11 +350,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const int tile_size = ntotal; // number of elements per thread
   const int team_rank = team.team_rank();
   const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
-  //printf("ntotal scratch_shift: %d %d\n", ntotal, scratch_shift);
   int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
 
-  //printf("ii: %d\n", ii);
-
   // convert to grid indices
 
   int iz = ii/(xlen*ylen);
@@ -565,10 +363,10 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   ix += nxlo;
 
   double xgrid[3];
-  //int igrid = iz * (nx * ny) + iy * nx + ix;
 
-  // these end up being the same...?
-  //printf("ii igrid: %d %d\n", ii, igrid);
+  // index ii already captures the proper grid point
+  // int igrid = iz * (nx * ny) + iy * nx + ix;
+  // printf("ii igrid: %d %d\n", ii, igrid);
 
   // grid2x converts igrid to ix,iy,iz like we've done before
   //grid2x(igrid, xgrid);
@@ -578,7 +376,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const F_FLOAT xtmp = xgrid[0];
   const F_FLOAT ytmp = xgrid[1];
   const F_FLOAT ztmp = xgrid[2];
-  //printf("rtmp: %f %f %f\n", xtmp, ytmp, ztmp);
 
   // currently, all grid points are type 1
   // not clear what a better choice would be
@@ -589,19 +386,14 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const double radi = d_radelem[ielem];
 
   // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
+  // The purpose here is to transform for triclinic boxes.
   if (triclinic){
     printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
-  } else {
-    //printf("We are not triclinic\n");
-  }
-
-  // can check xgrid positions with original
-  //printf("%f %f %f\n", xgrid[0], xgrid[1], xgrid[2]);
+  } 
 
   // Compute the number of neighbors, store rsq
   int ninside = 0;
   // want to loop over ntotal... keep getting seg fault when accessing type_cache[j]?
-  //printf("ntotal: %d\n", ntotal);
   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
     [&] (const int j, int& count) {
 
@@ -621,24 +413,15 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     const F_FLOAT dx = x(j,0) - xtmp;
     const F_FLOAT dy = x(j,1) - ytmp;
     const F_FLOAT dz = x(j,2) - ztmp;
-    //printf("dx: %f\n", dx);
 
-    //const double rsq = delx * delx + dely * dely + delz * delz;
     int jtype = type(j);
-    //printf("jtype: %d\n", jtype);
-    //int jelem = 0;
-    //if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) {
     const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
 
-    //if (rsq >= cutsq_tmp){
     // don't include atoms that share location with grid point
-    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-10) {
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
       jtype = -1; // use -1 to signal it's outside the radius
-    } else {
-      //printf("jtype rsq rnd_cutsq: %d %.11f %f\n", jtype, rsq, rnd_cutsq(itype, jtype));
-    }
+    } 
 
-    //printf("j: %d\n", j);
     type_cache[j] = jtype;
 
     if (jtype >= 0)
@@ -646,12 +429,9 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   }, ninside);
 
-  //printf("ninside: %d\n", ninside);
-
   d_ninside(ii) = ninside; 
-  //printf("%d\n", d_ninside(ii));
 
-  // TODO: Make sure itype is appropriate instead of ielem
+  // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
   Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
     [&] (const int j, int& offset, bool final) {
 
@@ -663,45 +443,16 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
         const F_FLOAT dy = x(j,1) - ytmp;
         const F_FLOAT dz = x(j,2) - ztmp;
         int jtype = type(j);
-        //printf("jtype: %d\n", jtype);
-        if (dx==0 && dy==0 && dz==0){
-          printf("rij: %f %f %f\n", xtmp, ytmp, ztmp);
-        }
         int jelem = 0;
         if (chemflag) jelem = d_map[jtype];
-        //d_wjelem[jelem] = 1.0;
-        //d_radelem[jelem] = 1.0;
         my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
         my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
         my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
         // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
         // actually since the views here have values starting at 0, let's use jelem
         my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-        //my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[jtype])*rcutfac);
         my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
         my_sna.inside(ii,offset) = j;
-
-        //printf("%f\n", my_sna.wj(ii,offset));
-
-        //printf("jelem: %d\n", jelem);
-        //printf("rij: %f %f %f\n", dx, dy, dz);
-        //printf("params: %f %f %f\n", d_wjelem[jtype], d_radelem[jtype], rcutfac);
-        //printf("%f %f %f\n", my_sna.rij(ii,offset,0), my_sna.rij(ii,offset,1), my_sna.rij(offset,2));
-        //printf("%f %f %f\n", my_sna.wj(ii,offset), my_sna.rcutij(ii,offset), my_sna.inside(ii,offset));
-        // we can't use std::cout on device code, maybe make another function for this?
-        //std::cout << my_sna.rij(ii,offset,0) << std::endl;
-        //printf("%f %f %f\n", dx, dy, dz);
-        // apparently isnan is also a host function and not allowed here...
-        /*
-        if (isnan(dx) || isnan(dy) || isnan(dz)){
-          printf("Found a nan!\n");
-        }
-        if (isnan(d_wjelem[jelem]) || isnan(radi) || isnan(d_radelem[jelem]) || isnan(rcutfac) || isnan(j)){
-          printf("Found a nan 2!\n");
-        }
-        */
-        // Our best bet is to make another non-host function for printing
-
         if (switchinnerflag) {
           my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
           my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
@@ -722,24 +473,12 @@ KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
-  //printf("^^^ ComputeCayleyKlein\n");
-
-  /*
-  if (DeviceType::in_parallel()) {
-    printf("operator() of TagCSNAGridComputeCayleyKlein is a host function\n");
-  } else {
-    printf("operator() of TagCSNAGridComputeCayleyKlein is not a host function\n");
-  }
-  */
-
   const int ii = iatom_mod + iatom_div * vector_length;
   if (ii >= chunk_size) return;
 
-  const int ninside = d_ninside(ii); // use d_ninside or ntotal?
+  const int ninside = d_ninside(ii);
   if (jnbor >= ninside) return;
 
-  //printf("ninside: %d\n", ninside);
-
   my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
 }
 
@@ -752,7 +491,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   if (ii >= chunk_size) return;
 
   int itype = type(ii);
-  //int ielem = d_map[itype];
+  // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp`
   int ielem = 0;
 
   my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
@@ -777,7 +516,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     const int ii = iatom_mod + vector_length * iatom_div;
     if (ii >= chunk_size) return;
 
-    const int ninside = d_ninside(ii); // use ntotal or d_ninside?
+    const int ninside = d_ninside(ii);
     if (jj >= ninside) return;
 
     my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
@@ -827,7 +566,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
     auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
     auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
-    //printf("^^^ utot: %f %f\n", utot_re, utot_im);
 
     if (mapper.flip_sign == 1){
       utot_im = -utot_im;
@@ -893,41 +631,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalFill>::member_type& team) const {
-
-  // this function is following the same procedure in ComputeNeigh so that we can fill the grid
-
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
-
-  // basic quantities associated with this team:
-  // team_rank : rank of thread in this team
-  // league_rank : rank of team in this league
-  // team_size : number of threads in this team
-  //printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size());
-
-  // extract loop index
-  int ii = team.team_rank() + team.league_rank() * team.team_size();
-  if (ii >= chunk_size) return;
-
-  //d_gridall(ii,0) = 100.0;
-
-  const auto idxb_max = snaKK.idxb_max;
-
-  // linear contributions
-  
-
-
-  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-    const auto idxb = icoeff % idxb_max;
-    const auto idx_chem = icoeff / idxb_max;
-    d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb);
-  }
-
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill2, const int& ii) const {
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const auto idxb_max = snaKK.idxb_max;
@@ -937,39 +641,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
     const auto idxb = icoeff % idxb_max;
     const auto idx_chem = icoeff / idxb_max;
-    //printf("blist: %f\n", my_sna.blist(ii,idx_chem,idxb));
     d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb);
-
-    if (icoeff == 0){
-      //printf("%f\n", my_sna.blist(ii,idx_chem,idxb));
-    }
   }
 
 }
 
-/*
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
-
-
-}
-*/
-
-/* ----------------------------------------------------------------------
-   Begin routines that are unique to the CPU codepath. These do not take
-   advantage of AoSoA data layouts, but that could be a good point of
-   future optimization and unification with the above kernels. It's unlikely
-   that scratch memory optimizations will ever be useful for the CPU due to
-   different arithmetic intensity requirements for the CPU vs GPU.
-------------------------------------------------------------------------- */
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagComputeSNAGridLoopCPU,const int& ii) const {
-
-}
-
 /* ----------------------------------------------------------------------
    utility functions
 ------------------------------------------------------------------------- */
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 55256f60cd..258fcb97a8 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -393,14 +393,9 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_cayley_klein(const
   const real_type z0 = r * cs / sn;
   const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
 
-  //printf("jnbor: %d %f %f %f %f %f\n", jnbor, x,y,z, rfac0, rcut);
-  //printf("%f %f %f %f %f %f %f\n", rscale0, r, rmin0, theta0, sn, cs, z0);
-  //printf("%f %f %f %f %f\n", x, y, z, rcut, rmin0);
-
   const real_type wj_local = wj(iatom, jnbor);
   real_type sfac, dsfac;
   compute_s_dsfac(r, rcut, sinner, dinner, sfac, dsfac);
-  //printf("^^^ sfac wj_local: %f %f\n", sfac, wj_local);
   sfac *= wj_local;
   dsfac *= wj_local;
 
@@ -521,8 +516,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_small(const typ
   const complex b = b_pack(iatom_mod, jnbor, iatom_div);
   const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0);
 
-  //printf("^^^ %f %f %f %f %f\n", a.re, a.im, b.re, b.im, sfac);
-
   const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor);
 
   // we need to "choose" when to bend
@@ -609,7 +602,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
       ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re);
 
     }
-    //printf("^^^ ulist %f %f\n", ulist_accum.re, ulist_accum.im);
 
     ulist_wrapper.set(ma, ulist_accum);
   }
@@ -651,7 +643,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
     }
 
     ulist_wrapper.set(ma, ulist_accum);
-    //printf("^^^ ulist_accum: %f %f\n", ulist_accum.re, ulist_accum.im);
     mb++;
   }
 
@@ -660,15 +651,10 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
 
   for (int ma = 0; ma < j; ma++) {
     const complex ulist_prev = ulist_wrapper.get(ma);
-    //printf("ulist_prev %f %f\n", ulist_prev.re, ulist_prev.im);
 
     // atomic add the previous level here
     Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac);
     Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac);
-
-    // see if we can see this value
-    //printf("^^^ %f\n", ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div));
-    //printf("^^^ sfac: %f\n", sfac);
   }
 
 }
@@ -759,7 +745,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
 
             const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
             const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div);
-            //printf("^^^ %f %f %f %f\n", utot.re, zloc.re, utot.im, zloc.im); 
             sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;
 
           }
@@ -784,7 +769,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
             sumzu -= bzero[j];
           }
         }
-        //printf("%f\n", sumzu);
         blist_pack(iatom_mod, jjb, itriple, iatom_div) = sumzu;
             //} // end loop over j
           //} // end loop over j1, j2
@@ -885,7 +869,6 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
   int jju1 = idxu_block[j1] + (j1+1)*mb1min;
   int jju2 = idxu_block[j2] + (j2+1)*mb2max;
   int icgb = mb1min*(j2+1) + mb2max;
-  //printf("^^^ na nb: %d %d\n", na, nb);
   #ifdef LMP_KK_DEVICE_COMPILE
   #pragma unroll
   #endif
@@ -903,7 +886,6 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
       const complex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div);
       const real_type cgcoeff_a = cgblock[icga];
       const real_type cgcoeff_b = cgblock[icgb];
-      //printf("^^^ utot %f %f %f %f\n", utot1.re, utot1.im, utot2.re, utot2.im);
       ztmp.re += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im);
       ztmp.im += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re);
       ma1++;
@@ -921,7 +903,6 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
     ztmp.re *= scale;
     ztmp.im *= scale;
   }
-  //printf("^^^ ztmp.re ztmp.im: %f %f\n", ztmp.re, ztmp.im);
 
   return ztmp;
 }
@@ -2315,8 +2296,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_s_dsfac(const real
   constexpr real_type zero = static_cast<real_type>(0.0);
   constexpr real_type onehalf = static_cast<real_type>(0.5);
 
-  //printf("^^^ flags: %d %d\n", switch_flag, switch_inner_flag);
-
   if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; }
   else if (switch_flag == 1) {
     if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; }
diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp
index ad70df30e8..12135c705d 100644
--- a/src/ML-SNAP/compute_grid.cpp
+++ b/src/ML-SNAP/compute_grid.cpp
@@ -57,7 +57,6 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeGrid::~ComputeGrid()
 {
-  printf("^^^ begin ComputeGrid destructor\n");
   if (copymode) return;
   deallocate();
 }
@@ -113,7 +112,6 @@ void ComputeGrid::assign_coords_all()
 void ComputeGrid::allocate()
 {
   // allocate arrays
-  printf("^^^^^^^^^^^^^^^ ComputeGrid::allocate()\n");
   memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
   memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
   if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
diff --git a/src/ML-SNAP/compute_sna_grid.cpp b/src/ML-SNAP/compute_sna_grid.cpp
index 9125b7dcd4..95c3fa70a8 100644
--- a/src/ML-SNAP/compute_sna_grid.cpp
+++ b/src/ML-SNAP/compute_sna_grid.cpp
@@ -31,7 +31,6 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
   // skip over arguments used by base class
   // so that argument positions are identical to
   // regular per-atom compute
-  printf("^^^ inside compute sna grid constructor\n");
   arg += nargbase;
   narg -= nargbase;
 
@@ -71,7 +70,6 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
   for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
   for (int i = 0; i < ntypes; i++) {
     wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
-    printf("^^^^^ ComputeSNAGrid wj: %f\n", wjelem[i+1]);
   }
 
   // construct cutsq
@@ -116,7 +114,6 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
       quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
       iarg += 2;
     } else if (strcmp(arg[iarg], "chem") == 0) {
-      printf("^^^ chem flag, creating map\n");
       if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
       chemflag = 1;
       memory->create(map, ntypes + 1, "compute_sna_grid:map");
@@ -188,15 +185,10 @@ ComputeSNAGrid::~ComputeSNAGrid()
 {
   if (copymode) return;
 
-  printf("^^^ begin ComputeSNAGrid destructor\n");
   memory->destroy(radelem);
-  printf("^^^^ CSG 1\n");
   memory->destroy(wjelem);
-  printf("^^^^ CSG 2\n");
   memory->destroy(cutsq);
-  printf("^^^^ CSG 3\n");
   delete snaptr;
-  printf("^^^^ CSG 4\n");
   if (chemflag) memory->destroy(map);
 }
 
@@ -207,15 +199,12 @@ void ComputeSNAGrid::init()
   if ((modify->get_compute_by_style("^sna/grid$").size() > 1) && (comm->me == 0))
     error->warning(FLERR, "More than one instance of compute sna/grid");
   snaptr->init();
-
-  printf("^^^ finished ComputeSNAGrid init()\n");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeSNAGrid::compute_array()
 {
-  printf("^^^ inside ComputeSNAGrid compute_array()\n");
 
   invoked_array = update->ntimestep;
 
@@ -226,8 +215,6 @@ void ComputeSNAGrid::compute_array()
   int *const type = atom->type;
   const int ntotal = atom->nlocal + atom->nghost;
 
-  printf("^^^ ntotal: %d\n", ntotal);
-
   // ensure rij, inside, and typej are of size jnum
 
   snaptr->grow_rij(ntotal);

From 66def742c44871535b15fdb91878673e517b622c Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Mon, 24 Apr 2023 11:29:04 -0600
Subject: [PATCH 16/51] Organize deallocator calls

---
 .../.compute_sna_grid_kokkos_impl.h.swo       | Bin 49152 -> 0 bytes
 src/KOKKOS/compute_sna_grid_kokkos_impl.h     |  48 +++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)
 delete mode 100644 src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo

diff --git a/src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo b/src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo
deleted file mode 100644
index 7f92a608886209c3570518c3c072aa00307d6609..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 49152
zcmeI53w-2NmFI)+0YS&{J?qRR3{Xi|9^Jf}&@}cF+U}%Vk_M$^ib~Z#$v>$_sH$|5
zZd$=7DmpqoP#F{-%&afQk;RWQpA}toc11;86(5MAtEk{x$8mMWb-(A_d;kA`RjQIo
zCkQ*6`t+~!f82ZSz2`p9x#yn$X#YjK2F=Fa{tkYR@96m68&A)F{k>^(+rX1LviVAR
zqB<pS7X4M%_n)!x?9)zLG!QSJ?+Er4vc=+bwXa&q_3hrfd-vYqzFfIDJyQ!Z)lxPy
zUJ3J=sq)lRxta-!(}muNr3kORU*o_U2NvhROl_=Z<MAD*ZCI~e{iMzl&7VAB|Kf4h
z-mY<AjRR{OSmVGN2i7>S#(^~stZ`tC1OK5oP@6us;{}xHVXjo8?(Z=Te;?uQRe0cf
ztl|A5-Ti?3dt1Z%N4fir?(aD+0iVvJ-Se&P@2eZ$KgK=x63cxJ@BhfXKhyoa&=ttj
zKei$MKQz2Q&b{B~{yyR1_QJ<^w;g}!-d7vmyT@yP);O@nfi(`SabS%DYaCePz#0eE
zIIzZnH4dzCV2uM0Ne*Q59UXU(v*(LE%>MtV%jJ(A*U|9~@H}u5`1uJP9q$99;OXGk
z$9Hsm1Wbc6I2nBBu^k;ZgE0_*XM#>}B6tk=8AA3q!Dqk+z~6uugAnWi8^DR+c<@;8
zZwT!_0C$2ffscWYg7<(ofos84;7V{gK$X$a3*L+};9Bq!5P-h`j{@IAiEuA?6}T8Y
z6PyR044wo&j}qbz@L#~q;AP+v@C@)%6d3n`d%<_XZQwBI2j4*X@d|JyI1`)!o&mmr
z(&TRN4loJ2!TsnBJ`8RGhrl=(1M9%=se^BVkAfS4)X5p(7bqdV2)^Kz5sSAZqrZtv
zH7u2aN>B)jsj$nOYux=3uL*u2y!4z72^Nl-RCPA2<tCKWSRp&!by(lX+m<P`M`F4X
zmW0fT2~A{XCv7Qv)=N3<T~vsPOeLF7zIV4Sz*xCrQemlPLZYrehd-OnHQ8FZ*t5A*
zo1YG<JX*J|%SgHV`b@Q!t<=m|rCc;&DId(4^=354WoN2^DUX?n@~m;DECy4SJ~3r6
zVYP#}6Pa32t)==UhZ}nPg~8dGTG>74;cm5f85qb_f-IIW#jsK-S272~U^bQ4xLp#5
zEC`)Ab}>7>RAho^T$f%xY8OjIj6){l$L(sV_;JW&d|N+D#kHZ5@m#$v710GFCQKR(
zio16Y48YPtp`3G8Meq}rq&v4+qod)WE$35Dp8ve+OqbTJ*QBLklt{6zK`cgCDkp7P
zkd~L7X;YHc7?f&NO(L$*R+){fp+%nK0OY)mcq@`|&9opH2WXe1t*{p4?8|mZxhiZy
zYrws3q$Yi1IH;MKY2$cc3SqUTz3&KZ!5I6$F;kh`Os#r(Z!N&QjLv3?<$O3cziH3l
zz6;9L+6di#b4n{mV)@V$wljw5N$JB4Pbbql#wy)RtvD@x)tG2-nIr~-%jCkrWs<=~
z#+E4PlH(A8qhuhf*9X>L%0F%LaA9Qc_PqmU$6T$F&DCrGMk|vq&x{rVQz)0G8MF%J
zS(;**))?e9^@ykrSMVVt13oFFg1LZ~VY!s4PG@t0Irm&6e6n4vQp#I)64G5ogIoN@
zvnyh`3~ZaW2M5Dkkl?oq2B}(pCR@miO=m0FVl~yGLur;f)5+esZhgb8&1F7Y%NDcM
zsf?{^64)`gZ{)d|!7anPCA-mkhKl@xw+k2ma~(^Dg(Ewe4{TYIMH(1Tb}2WQP<HQz
zZfc8M=}>RFyL4QU_txAoE7c<nrQ>i%v%x7rJZ)r5%g79sHV1=Tjk%%_l*Vfln+FD(
z#ZB$mGB~(z*mM_)(`i}*<65>do;JCOY{hhUbKPYIOnuChhPB}&K!>lrO}Rq0S~YR?
zrH$mIlr09yd`T+xdF$w)d2TD0s<q0D<hi3m;@DPYGng3~X0GixDIbi5rN9hsdCtXq
zFWhcUOFr4NYiNgA-@oB>!iA+=VJ07#O(883o)kAHALe)#mc}<H!tp3y2uJH46|?Xy
z5wCM*ng$W{PISf*JDJesL0?~#raW?UN`25$5ROlbQYE%to4m|ag3D)^4w}7^QLFyB
zZL|%~f?~Nc-!z3lmJaoybLC>O{)u!N=E3SzwaN21-w6gxXTyqE=uS6HloXno@Cxyc
zp4Ix9?ml`;i{Fle*^;?92uq_uW!$8OF50zy*Oo2`Pm7}+j53Cm#%0+xQLa>J>=P9@
z9TI7zQm)lZYItZ*SK18McrsWnR|5B>qp#bnP61Y*6T?AZY7>FEaL1PIgFDPv$Oytv
zFOO>F>7GJxFen(;_48px2a5S#;fVw}e`vosKR`;z7R){-hcIXMFd&zzfd<J1m0A{R
z6<TwxYUzemQ!S6xX0w%`*O-xs#62rrlZT_qX(p8!7RVJzYWt3!E!+A}?>YPQEqgYe
z(cj;15Gp3jaJG~Wvs6xHx?IW1{K7CouJ-p1_nPy|2ZKsUmPw2#l8dTP+E$*P*W8%Y
zhW`H3x|+}+SNkPl%Y&d&tjfw<TL8165b^8~-jt^`i5dgPfXUk@L8bakYogv;u8d2J
ziCS&CI?&fQJ3HGOzv-i`*dh#R?qTAdEWMtdRS5&luS=?j7nSj#r<%)BKQaY`xxT@2
zfjOkW@H@z08j_iLvybVnP@u|&!y>O_gp|hD-D7g=*6H7)Z-_GVGH#c~dsu5Q;=z_N
zZ2UVqK7?HPEaXp-|NZ{<?~(aG0{#}H!Trem_ksTgJ_i04+z2iQ7l1zn-$Dm)H~2dE
z8u%)BHJAr!unydZ9^msJ1D*wTf!mSwKMSr0)1U;3U>$e@cq=mf%fZ#)kHBNV*O2Gm
z2i^nT4aULw;BMsjonQwz0X!BQ2W~}%{|Kmp?cmqQ?f(jX1?~bR@JR3oa38Y!E#PMG
zPVjbc19%Cz0PF-kU>)cNPXxa~wtuaY@w@nY0=OHV-UU7lJ_Y^<cptbNl)*({1Pp>b
zK<eNEuoG+rj|O)k|BJ1{5pV?<1zB)5*a)5qo&r|;cZAt6pnHyW$f9{`wi+Orx)DBG
zsbr5V6<}T#(1$w?E)sB&S3yP+@ifXv(SBvpZo1{R0}AnF1+EtxCF&9REy1y7g-J@D
zvZ78b@L0wr`AzKBt1p%=aV>?Vu;zH5<lEv&*+`3~z%bajKD6uDM5Y-Khqmm?*h4)T
z=A=FKPcGOyJd(L+$F@B?hBh}YF3n+c4icKL=f(5eDpblYOx>`%$Xa)?qMj6vvHCPf
zgT5r^@5AmWW^qHsXghDk>t<p>L;S_^l1R0UifxwIAE;`#@3?5!wjD=Nxy|z2ta@#@
zRjJ_BUxro6vbGGW21Q`rjLsV`DY1ANLkU|#YSyO7vS1ZgldDEb*45h7SZmExN@h66
zDke8^5kk61=SZ>>#h{2v`B0ELm@Oc=WTt}(YRPQT%GV<PNvE+x!oYyU+*GwxR7&km
zNAI==W7(NPtzLvC2^X4#M6gkzFph*Ki>D)%Y*?#q@~P}E6|<%6c#z+0HaF(^oDLB`
zo3*660c#O4+Gj{gotX#<lvp*zjSN6u_0_#85`%qYd9XQtfAoU0RbC0lwHnY9MAwLM
zbA5G!w&)Z@dmjbqu6qr)qBjz`OH@8}PayA^JVKKD%E$)uNyd$2$A@e41r%jrX{MO=
zfw$}*+1rI;&E-;vr@CuM2~h~Y#Jw=c&!GFsj;Fe86O!txi+E1Q!B(ScGABumLtQCl
z3Y`dYQ`$_GX-M6YsMYsNwp9^wN|dUo`J<||RTY)9@r50aq<6Hh{P>SnhdaxaKX3&i
zx;$vLcl4rHLrkZ|V9F?jwRtl(Q_6|-yQ)(I%TcfJ&g+qC<jPbSro&u07#kA}c9Ag_
zl@?Mz`%>~IKbN7u6y;)+E@}AXL{(=us{CtWS{kO$NJ#18%>s|c0Z*<A7;CyUD+KFj
zkIi0hG9|t65A4nxSwVI!Fu@`$lO?Ci5svYyK98CoXi~J@yL;md5|Fj29j4kFJsg8P
ztyLI}%FG}l)2F&jihi@r?L6%%qZZg177A!%mEVknr<jlp1$%qA!!4+H4T<J6vvdE@
zwvk<XhgM@m68V1~d3rx`w#fhePLTW>GXEFB9pH1|`JezcgOkCZgFgjN2R}ph{}lKD
zcs|$zL?>_-_!ICr@Iz$(`@uhg4+F6ocp~^6^8b&)kHGyv^aEc9cY!y7?O-eD1}ShV
z_;c`Q;BNB#dhpla1)vJbK+5uP@H@)&W$^dl4PXxt{lSmW6Wk7N0xtxjH|PMrM^A7&
z5c_~Za6GsZUBPvr2nrw#9uIzup5RXKLeK~9MF;SDunYVfvi}#rhr!L@gWw;))4{#S
z^xp;F0Y||3;3OdX{(lc{0`CVCpdZ|UZsv3C?fXm38Ge0zQ4gC;^e}g=J*>@O3>55H
zDGk2`zLJ=2WUOh5^nF__`LkI0?E8lQSC*<HwiFQwX2Zl+I#0E#0WIiq!cHzL@Paz?
z*;_hOzP?wI9yKPL1u_UvyS*)ujH>3cQf`HGCA#~ys1)9|c$4e`$SUmyY|;r97H}1*
zts*ILqGD!NkKpQ2vRj~HH^Z;3`f95hR03~Bvek3eA5qg-Bsc9u_sbXGi>%?0l+a8|
z{ACn%d7))h_aG0_TELZ5<w{EI8N9soCB<b_Nj;MjgTPA2EF%X?B(sv&OT--;Gx6HI
zS>u-Bp5OA=A2-+pFBk16?IqD>PHeC?12<iow8&0_+G$|G>u$YBX4Pz7OEZD$UyW^K
zv|!I<rCN`rRKD#iEU1=U>?c^=peU4bY`|iT5Om3|ESkcVnyNg%nSySyy^dw&eLEHw
z-q1t^VkZ^4EfV9T#PLdbHd6^FQ~l|FC-_#1VW~fDjbFtKDm{8AsFd|eWMuoO#6GHC
z%~pfL7y`E5L>721dTcu@^{qKTrd*~lnyQRtbN%VsY<UuSJ1r}HXU;V`k69rEy{)5D
zpUza#i(=^**-;Y1E9vq<FrAT_be5%QYn-Y@w<@Y;A;o5L0%_kGlWJpdoKOR{))8a#
zMB>Lcj?>MOimeZc5#MNxRz}Iq=s251Zy~Jln}zcRhmox?A>o%TlKo#ScqzLzrRE~C
zFOGz*B@OqiO4c`@uET~Tw1XuU%hfU_ROm}f<5d(MQ-QgRv@SE_*fpvND^^K)y54xU
zk}m|9U}4%I1dX)3-e!4@`E9cv-)jj({=Wrz_8*aTMgI5a_htY8ci`W_FTpRs2f#Q8
zKps2}d<fb87VtVy0cCIrcrJJtcsnxw6c_>`=f4xY4O|0W2`&UDfuA6+{|NjLd<)zI
z#2!G-1w0F!1;ifURB#G-GI$dBJ@Wj0;ETYY3HVE{H-g84dy(tE3*;<78I-_Q@E9Ov
z{%7z>a2>cDYykt{Y_Jjh2HyMxd<eV_><3Q+at7cg@P6<<a6NbhcsaNlOoC^E@1qa+
zI5+?<0nY``0fXRi-~{kn^a9@jSA+AwW-tJr3?2!N0e7GyxE{O&TnbJD8$d5efn&kX
z(HDFIh`m4w90%+#8XRP|qs|bzgkepX57M3cvb07vS(=(lCqji4+2VDb$UW~&*UB}l
z5i1y%1k!aNwZP!BI$z4ILJ|u}EPOJh!Ucm{lYF-f-SUN!v2W#nDO{U;YByx7aIKy<
zFPX@CM;Bv3tOXC8#a}XE39}4dEY)V(X6jOe7Bs#Vg=u`e3gz)6`K$fzfh&_3txJbO
zlkF32a>^lY{%r+f3BT;V-9F*P+lugO0lSh<xNa6?v92y1VgMdAq3oh16KEJ1TSsa-
ztoXEKFzsA~Y+NNRx!p#}N1H$*UgC0*IiY!rATD)bDJ+*x`{5T4xuCHuYzfUo3uzBp
zM-5Y!uEe&;+$dd34kzfD2=<6ECYDu}H|ee#HgSv|?3ymKQ9~|EI{UE%XK#o3lY?VI
zreM~^55`5r^T#t{q7tDhrnW2t_j!eB-_(><ma141PFiDGT^O?TH*svuQq^GA%tnzK
zUyF2M51pyD)#ghos_AT;*amaQUBp-+Q4V&_dC=SZsK#7fz_}KoJWFJ-YCK1hevTf$
zxMHXMGxFShJM_vfz7tCdQc9H$O@uwALxnOYv$|8IdAXbScXM(#r*|S_&2bWfIHmZp
zJjGZ+pT<$;=^^(tCa=}x<y^>c9D{IH3h+X&UCOVv-0o`8+di(w{vHj>bptEIM|;5(
zY|8Ho#|NRBSGA24#b__M?E^j)y<0pYOje^$(vj}Pcr5kfr7yGiPZ|WrnP>y@zg%|;
zkpcNn)9zH^&loEnOE0wtEl<y9u!F7!HNV8MT+ujb&4{~HD<I7vqI022GBif{UlXP&
zrVZIOag>bCC@QdNBeHEqvM8s(B$MWpQ;alXy{7<r4^_Q~u2mj4=D0qiBx19v7l}-w
z(;$<H2R7F+TZmwpvUmb%VPPGiB`FJ8ctP9qEU+~z%TxZj26?{J&wy}VQphNq?Ww!w
z0mcjE(QH9NSlYWVP2jK=o93j_f~bvSu+lg632{0?p6DTg5>`NYjy?F1csqyU%8aSa
z$j(fsI3h9|pnWNwS~IiRQYjmkQcf>OVO&GzmQxZO6koYYkGx9Kt287G3pE)K)c#-W
z*|#FAiu~{W^NIZb9w7Vv&jA;Mv%y(F_WR!sZUC<auK=>we<rx+c+TE~ZQxPhVc-}b
zd;8x5HvqBu{}QtME5J*^OTb|;3(f#PMQ(p7*au|KemD3v^7=2q`#~0@z-N%pKMmdm
z-U_Bc89W_)1sVMca2WgrxQFun71#@;tldEN?7s`X1O6BIHn<TCgWHg~KLOqi{xcW?
zPXu=&XI~5C+<qS9zz>kKzYksv4uDI*bHR6y!L|a-fdxOq?+6D~*#J9CdzRmB6<Y@e
z<jkWS0?CN6lRx>mKO8Rdl;gwGYKxbdVT@DDIybPH_yr?q6sWgqi9e8uk-~za*i;uh
z)^&122_<M-0o}OZS+R_Gnw#aOu?+gU`ttc|s3J{!nY4~7zftp8W@14@#*aSw(Z`J1
zYl}v`2t#b54_wkSqS%e{*DLdgH|~|$l}&@F=k5zCBf6oz+@=;)9agqYdNzeS8j8v^
z_usPPCnGO)y=>w~HIOaE_-rwitQavX_8E_=FI7@&;41qN2iZ!B?38ovY?F+tbw<Py
zkzs6Ewv>m<TSl7NL${!nF!QS)y*RUDGZ&fsnMBrTaXzPrk<GGcdJNu9#2bh=wT_G6
z%e{a!WOEZUr73F+4GT=SmtMOu7Pp&ABxs#qNd8#ESt&9WJ0-=cx(x{O17uT$SaNdO
zhAzmZtTe2boueEf>%`#zLbcepcKI?x^&(DL^2`die#Q%{MI+}v&16QRDu?XsXt$eu
z*4c~pQ8KlON>H6}q~j7NTLQ&S-a1?UO~SB3Ft<6jl4oUa4>8l{yA9^Ke36JQ?%F_1
zJbDA=y4E5LSja|->(bZt_yDX@)j%~79I=)aCwn=0W6u{LT9l(bzL_yjktjQ)LYh>$
zp1sv0^EjL-nag}ZPzWSHsMI@g8B~O&(Aj1SyFWCueXbZ0Td4`w{P+eSKUO|SbGBE`
z4{OfD+?Ko~#4375?1yP1QnyxEYrc-zS7Dy%OuL>qT?y!FP1i5_Ds`B>*mVDq^x-=G
z`VCOLJk&Sjg#mr#WuPZR>iQ$D8l@wjw*5lctJt^TSVLUtKAuT*VJ*Yvs2&=UIEp$@
zf=hAE)zh{t8jShf1rE5!t3Y4Rxv3TtXB;46scGP;8v?`^gzsQ7JRV6e2pF~dsG~@_
z%jRsHr*x9L3(MJr)p6XTVE4il+T{I^ICO{_uPWZ^8n0{B`hqbbdAsz2q2;`lB$J9O
zr`fz^vs6DACEEM)CN4)T(eNGPhJ+29jW7*u2IqB>!)l9BmSD|BSj#Fu{b*M0u#`+n
zQ;LV(m<7^4NiV?^X05uB!q1O&4`pIx;G@^|xOQZ6L0CwV@gt%0rqX3y<6!Z^`Mn@p
zl0yhp^cYGg&JTG>=qpu2!N|)2gvhic4$Hu<VYSThV@M3{8nRiDNrzgl+f>z@cWQWf
z-N`gj3DskaxLvSE>X=y0I&-5;0Iqn^2laY$)3|+cdJs9Fh=RBRdu9J$_K+_`wiWq*
zD}UtodE|Vt{r@W<J^+sa9pK%__<8U|@FnE>&w)>a+rXQ_8^8z<x&I}|`C_}j8Jr6q
z4ju;n1sPw?{9gqg4`lED3*eLBdhl%UE#&)mf_H!l=mK9xt`}K<1l)s6|5k845Zn9^
zJQjQtxn9ome+|gK{FUGZ;9?-Y`ovaW&iLPjjQ>^e74Rys1&H1LP2m0DJwSN!Y#_4#
z6TlyX$Aia#6M&rOzZJY36u=ai1f$?&@I`b0*8n-Ee-;pZz_-9%;CAp?a1HQ3ar!W=
zt{+&I=q&FB5_>`jM|IVJ!GB!c%yIB>-W0O)ETgFXU|MuimVk>IQ>%My$CP#{=5U2F
zUbc%gQEvHFSA9~eLo-~Ze6A9iL~&O{Q6ih+B}pt9pjov?kyfidWL(!)cvS6AZTnXe
zzeQ_fG1|+q>D52B&Ay(_jOzYNdK1UEt9F#MhmqadS0@1$zca`z2k8an!h8uIF4=-;
zw9VFFT%6?iBjB|Oe5<f<#{W%-k0;SIOIyN+0Hbm~n8w#sFOG8pG=H@$7GBvrev4{Y
z>#gR=a9N2r+22u{frW1FnuK&N@#0gix<wdKjkj_ZB^d`>f}U#s<Isx|Z3x?v@*>iR
zTUp^QiQSW=6e1g*>c-3qF)ieT#M)?&Kfbrwuls~7YN2>;Pb_bY9Q4x$k=Q6ucSxgQ
zbsG7qGE;OCpe(a|HvGVAb;2BjFICYF7q`X|70&+bv&?7<!73Rd5(XM%0e@gJk$Ec%
zG=z&Kfh1KgWVI_|)x}MU0wcaKTSJin`uc|Uj_epP?94FDsmDS&Q=VApL_6Q?o}Z^N
z>id>#;cHiC^JXH<O(4bM1Cg_>v*npW-jqu?Um6YgS3-!+(X)A^wN%7$6D9*n^5SIL
z-tFsUdVp3<G@bhg1_QT<-C3^S_`xQaF9%h&jFeDRyyU<mSyWnvktE7*>qg+xNHhVz
z=xfyg;`ggp{V~nuR(>L8vg)^4t%R-kc<t{SrbrnLM|-7xM6s6S*Clb6@i{ouqQk`R
z7va|-IRM>$cZq2pEx*WZ6MkKy9i@(6nzJOo^nsOu9KS9Jg<lc-Okxb86)%_@kx6{V
zG+Q<5B4cc4)XEcjpD4blWRe4~;pax>WSCZrkJ`Y@^4OoG?JIyKPn#;Cy4cA|81Zb`
zuhpQkTCAo^Mr+hsEOe@P88u=>;RMwIRues)W90(+RIhPM8#$AJT?-00sYY8D_WxEh
zhpYGfp*p0C^+=-kuCESP8Oz=MXsOJIs&^*zs_h3;39sdUh$-QvYsyb_ydwK2Y`xl3
zx=SmN9|R)(dY{ZDe)%8@_Bsu_$p5z^o1Tk2EAqepzMk0q{}Xs8xEopj>)@4O16U7!
zihTbQAm0FZ4akD^pdUOHd=WWcz6J0>@G7ta91p&SyniE*{r(fddy)0UpZ}}CH9-9M
zi~N5sI2Ak%90z`f{QvLZKJWu@Gk6iW2<!(>0lz~pApZU34F4_Q^`Hta1kVPi0I}!)
z-{9Lo{QbWNybxRky1`G;4SWOK0ImaX0b<K9`h#P@N6`~p2i^i+0Or8OU_Cev+zTJS
z3H~d%1-uFbK=}JK@Ko>=Ah!J<L`U#4@HgNE;2bajo&i2Xoygz+1pb-pPl6l3)!=e)
zD!477JFqoxhkti42&eal82u07jT&Xx+=0nU4{UIb{Tsv06K*~cPFzV#JBq^9-H4as
zqbA;pQZltf0<D54OC@2*YCA!f%YXz65}DG(U8>?AWd#g(RF;$X2P$k-*`o!`)gong
z&1pF~Nl@3;vt*NV{1!9QxkS_KcTE#@fF#dIC(gIMzB4HXm;nvpJO<nJcV>3(*}G-L
z<fUiKr6$wtS@vibNjCT>lw?vuu{YPF9i(X}%_5=rwgZfboWGA`b`6c-@?G4btF{~0
zu+<dq<0qYBzM%+_#{@9|<{Xw3@4zXS%9%@PWTI}LREJfkP}-SN%~}U85D=Y<EB}0i
z?a^%hB6a+Gq3n6-ZfDx5pwpxR3o&EE6s~Mt!|MjDec#*LYo<iG$oVt%om>sZjXG~b
zsKJ^xo5O03^LgXy{yU@lmy9zafJt_ptu-9b?_H3PTYgxxyM?yZY&qf{2Or|q%XL9x
zAjG->7v8n8R43w>s8#U(lS2)Uui}%(?7!#9hL|uut(C;Oh-+oCY#T@6^_U`=K=)#%
zXK*@<21YFOWJxAU8o!29ZBG=-jF#sj(?Bl)W)Ve;I3@O6#wl<<yJ0SxXO)CH+Lr{<
zYNK5UYNsNyj&o`>(D$YCY_Alvu4`}<QS43f{;MSiLiw11HhQ}UZV#V!8GWuH^)#X=
zE^H_m@82og<K~02DYfsm7R_)%%^ca*s8F4vv_k%^)z}xvO_Z^Rbh7cN95Ep<+Aka#
z)(Z3NBcQ@i^@W<iqcM|e5!&nH^7u}qgC*Ta!n+BmzAx&;;{({4r#^soe;>g1e@~6_
zhlkijm+46QrhUHtQh7dapU=n7`}O&peXdTc`qWx}!(1wq?9qtikG32NEZ5X2nZYDM
z{9nrhX({1cI-F0Fu%6JI4NgTTh&`yN4^YYk!ULlFW*de!=72dl&p&OfxZAQ1eOl68
zEPYp^R7vt#n(ty`ON+uiS1#9dFZuWkq_YL--}BfGOmiNalfbICu86CtL={Pv+$?2T
zbQ*1iXCXwz7P(LsE|HGhVUetNhx876WWAFN%19S(Fej^pee_gD$#6rTSlvg@HpsJ(
z@h}x0>XtX%Vt#M?q)?LaHw^ny$yb(bj@`m%<+roV)!de9SY^=8>_`I%ac9%rC5vNp
zhhhjLZYon<y^L4xI`K}7wZtwZ(p_;aTWRGnA=8yM4B+eHEzOB=j3XPqdrAdGyPfii
z)HO*)jj1HjbENLANx2MyWu#jEP#ZK2L;t~uLL=5ujwxBHwlqT8T149pA$f>1Pfe+m
zpb5t2IsW@EdQL@*V_4#Ad)zf<LK%VR^fW%X61U^yB`iG1WzTwd4n>18BqHm_oMwbf
z29`D^W<MIl#Zy=Cgkxh-3F>4Z(*%JzK~vGe1_i&yBb?|w(J=@qKH~wy934IC95IcY
zdD9tdhNA6&QJu!MIcd!lJ0!4k5APSUlIqp=z>qgS6Ucd0_$!qcHy~{!Qti}Y;J%=u
z1G)A%Nu3v3UD9-Li5zp@GCTH@FQIJsM0&)~7K5@+-qS>20qn3r667^hBCytE6g;Lq
znyoSwUOvMV9@cO>FBMfrgDD5aRj;h%CA_XFmg#q3ffGEN!$J8h0N%m9Z(9nW+a73V
ztzo=4_K~Xf3m@zKR-1wQHb%66UDZiA*^0bFM7j(ovS#_>Kv)dLQT}wLoWp}BJt$lC
zYYDYU5Ot{>po-5gGPADOk46iBeQpt>38!t(l&(^@Z7>4A_bsq4O6(dWiQuS|#?3)9
zJ#Tf2HjF=j^FMJzDf0iDk$c7Goyh-Zy3Od{Aoq*^zxRWeftP|J5dVMYg8zdK;J<^b
z!ByZ&5CE|a*ax<PEx_vwp2W4-415gTz?EPx7y?}&1)czYkAC1z@Lq5g$b&&}BKVHe
z8;tW;&i;QKoxmaR3?OF%z6!1caxUP{fSm9DI%WDCxDi|rq>R^qR{}Zb@4x@Ij_XH)
zH^GNjgGsOv^nh<8^WOsIfB_#u#vcN5W?#+-ydB&C{up@MfZLJTKMSq_=Ycc8>EHzL
z3*>e=r~g_YXY|ux9q0z%LY{vccxx=z8~xeYWZgO$mgMJEeD-d!w`hCh*BGSf!z5Mb
z8`RAu;vKZCahlLtBATA9>S8R`wdj%wyHpx8y!=G;Xhnx3#~(@*-z?-vGJ&NMcXT9}
zy<93nDce@KwgRE}?oCz;j+yjF9_+>9Ws@+AW!KSIEJ%~K(WH`_@L#iT6KI=$+xb~0
z-CAnd5u_8eQP9QtyE5k4?*^vA2P*FibN4`{v@*IHyS+BFvV`J6OWJdSR^qbOd|+<m
zWjitAF!5E;CjItWXJf83R(pxXp&GANr^#EYt3!)pbse6bj{OUVP0sPiw4|So^%E<i
zgZMb}Zml8`$=Shi(E3rrXITBP(0fKVPEwLimQ;O%;8FKfQ@vEhVV_M$*Lu5ZK{s}V
z<<vq-H}P}<T`5oFCw!v=hY@G2PlWoSOvX%cS*yrLAR7wI_i$@r4tbo9*NAf=UBPZn
zrTD}xa#gIwavAFu8hA2Gl-v=YZI7^bXm(&hNfFEVz84>j#F7J!<gEN52imQGVJ5Pu
zY3$hw`x#%+TcY*i{FYVCiOiy8qk4yCq?NTk8T7phELN$9z@yPb06=@m*VuCI2nj7t
zPhN=SZA8#>GRa2KL1Wa=oRRrM3t1(e>Y`RBy6M)j=a-6&%A<AcxkT)`j$zqEO#PDX
zfk`R7)5}Y97CnV!RC3v8A4^ao%Ab^1gi9PNM|q`TEB$<vkS%odO+vQN)7wtZ*Cw+C
z@2yn|tA!FB11vf$a=*``mM&3@NbRi}wU#=2J6+Lg%H;yz>5EeHDrCP!%i+RoDdnHG
zrM|b{d)<<soME3p)J%Lkt`_#VWg_P~yyjM{rx^F*%4u*VC#C9@vPM2`XlT+xoF@Mq
zBQH4^;_hi5-1E~72|o`uaL)!2(h;YisO)voj~GkEKICN&-^!)$Nzdw8FWM>5Q?Vbe
zS}U>9#LJGV7ere=qehrIa<gwBH5Pr6Hl-Txw5HW%uDC)r7C1Y$KIqw?wJKRB9jmhN
zUlO(F*;JjO2cV+!YgV12N=fVbMRQ%>a3uk{#awq@)V2BfZd9}UJ=C37{j}>FiPoc?
zpb4Q?J)k~0sIOp$%E|v%AaDK%`7^oy|664Ie*xn6@8{rW;3jY_knaHg9guGTmcS4g
z1S#+s@Mv&5^89CjoDH}dTn@x0;I-g2;5lF;I17kB|GSXsZv&qK*Mc_y@$G*hI0sB4
z=l6mh@Iz$#Yrz|V`1E@;_$TD|*8wlr%Rc|tz}vtPAie;^ci)-d4DfSg`5%J&!4JUq
z!HwY6;1kIASAi?RW^g~U{e9r$pa5hqe-ro>^1S#1xDL#NIq+;Ce)-meU%}I#f*%9n
z@4tf2gPXwzf%xfrH`ouJ1%69Cd<A>~{7>+1kod_W#Ghev^N62oX2J~WG2Pp<ZeDQh
zuvFkSg1Egxbcyn**fRSgld=P;w;aIny<hwLWP)DeGFoCZ$!eBQjkHBt`n1qgBsj9%
z%cYsJtP)GM{6HAF!!`VDAC-9`-T|!}QQf#Fj9W~Pous7t?1){ADRl>%SZhVqV%KdA
zJI7JX7Q4pG8cWSN^(l6xWn9;}P`0@7Kg!M_dkGl~HQDxwazoDCHiZiwziEuceq_9%
zQQ>=E2T@^nDl<D}5|}Mlrr2s^?>leK-?q)1#kQ(^o|O^YYw9C)vuf)TZPjPCKA+3m
zA+V>W<AyAZPNW@?)-4V_p+<?kw%ctJ^ofaaKAKV_TPxMbKfr2kXDYZ-MYkjKAgQ_;
zIpB2b(X%SPRjg%%Yr1y0lge6j)LR<^;xS-yGBc^IyT{nSQP-DlvWyNLkDrzft3^*b
z8Y8IAL@@ulaY(5jh@5W%hNQYf`YPqfNVeF})H3ce%a#~0qOswjJy@w{F7e;+|HNRW
zUXoTmSSgtYZ?KA%P1boqDw@6-jF{C#vP(>Eb|I!MSl%!ldwVOUbrd1i9Ecf|6E12p
zo(g;Udb>&?s1L+c<T5|4CDj&}(aDr*QeHtI)aA=ki<h`}E?Bz6{j9rwQK5t%s`H`x
zYDj#dEjdxdg{6Apo2a0FfZbeObT`NaIdvJ4nG&B7i8eG@!W7Z1y`y!!uHxlIizzKG
z)nbd<|L<DGHmmMSIOlMKPduUMQX*!b<^F@!O0=2yh0qb*|34F1^{<g<MgHICWU5<{
z^ZyRWx&MzM&)*7O1TF&^a5mTojswSnFC*XI3El|A2fzTh2f6;6;8yT1a0M6z&jSa*
zqrs!Vy~y_>@9zNT0g?C51{=Y@BJbY=Hi6$F=l=%W3a$fh0X0wsJHR#|zWwD}e)pjZ
z*a;p19u7W*%zq<z7kEApoxol&2=;(u!7q{h2f=aRSa1ij|NFpu!HdC*fcN8nBG(3d
z2EKh7ycxU!ydDTIUkhFXc7tbu4PZSu27Cd2-w0};3Pe}X4@5U`2RH%z8rlDrdig&g
z?Ydz(WTY3vo}{%HlA%6Im>e}RacV)J#bdRSYKv~{ozqKXe^}co^MIFN<q0f5kIPDP
zS%Ih62G@bA{if6^!t#sN%<fLqSFAcjQxmO$+!ty^Rm7qP!))ARa3No*r2~xWad4+m
zv7>CX5#?SI%1w+)Dl?tUO(mv7C6%OHuiz4ex)_N1q;+`}-SAZ`o1-e<thHCsM{DCT
z--n5F;-AE8Das>Lugj@F?;6`R!V;ZbC3g}qalAaCeTvi6#RmySYe$BLXhDJuRHs0C
zNUyx2bC6rZr6yNl+e2vuHX0@x+T`SD(^c2xTE!!S(5@T5jO-RyOIN#a_Q6_F$yn8}
z?mkps>xY`x`l0&nuBWrtx_g*+wSLKxoqclh(1Tj*c9ydLM{PilW9t-?35$u&Z#bqy
zE0<4_&txXDg)wHFMmb)lmR@gn;o4M!UL`nsdNOsm9E=shX>{4+d<`MFyY15Rr^}no
zk~7WoeAr;XU(3irC%wSo(GfQxd*-+R6tKr#+G0A-v8{5~v%c#v-AjI+vZoLhErnK5
zC1!_2%n~yZyRn#IaY0Gjs41IvOd6B)XzGqQh~vq|rAAw%HQRO~-#Jq#TtH(P)W+l5
zP@I%DrZK{3i#i~;_FzxEXo)mJP7SmPOSYhh14oV!vSg7P7A(puS>(hoNyS@2ne*7l
sARxzAsvKaj=f`w0LV{+qAO18qc*9s5#KL^EX4${wFeYq!u*R|f3;$<ZrvLx|

diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index ec55b5fae4..5ec494f206 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -133,6 +133,12 @@ template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
 {
   if (copymode) return;
+
+  printf("^^^ ComputeSNAGridKokkos destructor begin destroy\n");
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_grid,grid);
+  memoryKK->destroy_kokkos(k_gridall, gridall);
+  //memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
 }
 
 // Init
@@ -163,11 +169,17 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
 
   memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
   memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
+
+  // do not use or allocate gridlocal for now
+
+  gridlocal_allocated = 0;
+  /*
   if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
     gridlocal_allocated = 1;
     memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo, 
                                      nyhi, nxlo, nxhi, "grid:gridlocal");
   }
+  */
   array = gridall;
 
   d_gridlocal = k_gridlocal.template view<DeviceType>();
@@ -331,6 +343,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
   // Main difference is that we don't use the neighbor class or neighbor variables here.
+  // This is because the grid points are not atoms and therefore do not get assigned
+  // neighbors in LAMMPS. 
+  // TODO: If we did make a neighborlist for each grid point, we could use current 
+  //       routines and avoid having to loop over all atoms (which limits us to 
+  //       natoms = max team size).
 
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
@@ -369,6 +386,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   // printf("ii igrid: %d %d\n", ii, igrid);
 
   // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
   //grid2x(igrid, xgrid);
   xgrid[0] = ix * delx;
   xgrid[1] = iy * dely;
@@ -634,6 +652,34 @@ KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
+  // convert to grid indices
+
+  int iz = ii/(xlen*ylen);
+  int i2 = ii - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  // int igrid = iz * (nx * ny) + iy * nx + ix;
+  // printf("ii igrid: %d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+  d_gridall(ii,0) = xtmp;
+  d_gridall(ii,1) = ytmp;
+  d_gridall(ii,2) = ztmp;
+
   const auto idxb_max = snaKK.idxb_max;
 
   // linear contributions
@@ -641,7 +687,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
     const auto idxb = icoeff % idxb_max;
     const auto idx_chem = icoeff / idxb_max;
-    d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb);
+    d_gridall(ii,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
   }
 
 }

From 709da60474592bfc9729e370935b465d068540fa Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Tue, 30 May 2023 11:08:43 -0600
Subject: [PATCH 17/51] Replace limited parallel for with normal for for now

---
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 86 ++++++++++++++++++-----
 1 file changed, 67 insertions(+), 19 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 5ec494f206..db6245ec34 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -136,7 +136,7 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokko
 
   printf("^^^ ComputeSNAGridKokkos destructor begin destroy\n");
   memoryKK->destroy_kokkos(k_cutsq,cutsq);
-  memoryKK->destroy_kokkos(k_grid,grid);
+  //memoryKK->destroy_kokkos(k_grid,grid);
   memoryKK->destroy_kokkos(k_gridall, gridall);
   //memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
 }
@@ -166,9 +166,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
   ComputeGrid::set_grid_local();
   
   // allocate arrays
-
-  memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
+  printf(">>> Allocating gridall.\n");
+  printf(">>> %d %d\n", size_array_rows, size_array_cols);
+  //memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
   memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
+  printf(">>> Allocated gridall.\n");
 
   // do not use or allocate gridlocal for now
 
@@ -183,7 +185,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
   array = gridall;
 
   d_gridlocal = k_gridlocal.template view<DeviceType>();
-  d_grid = k_grid.template view<DeviceType>();
+  //d_grid = k_grid.template view<DeviceType>();
   d_gridall = k_gridall.template view<DeviceType>();
 }
 
@@ -218,6 +220,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
   chunk_size = MIN(chunksize, total_range);
   snaKK.grow_rij(chunk_size, ntotal);
+  //snaKK.grow_rij(chunk_size, max_neighs);
 
   //chunk_size = total_range;
  
@@ -322,8 +325,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   k_gridlocal.template modify<DeviceType>();
   k_gridlocal.template sync<LMPHostType>();
 
-  k_grid.template modify<DeviceType>();
-  k_grid.template sync<LMPHostType>();
+  //k_grid.template modify<DeviceType>();
+  //k_grid.template sync<LMPHostType>();
 
   k_gridall.template modify<DeviceType>();
   k_gridall.template sync<LMPHostType>();
@@ -411,23 +414,32 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   // Compute the number of neighbors, store rsq
   int ninside = 0;
+  
   // want to loop over ntotal... keep getting seg fault when accessing type_cache[j]?
-  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
-    [&] (const int j, int& count) {
-
-    // From pair snap/kk :
-    /*
-    T_INT j = d_neighbors(i,jj);
+  for (int j = 0; j < ntotal; j++){
     const F_FLOAT dx = x(j,0) - xtmp;
     const F_FLOAT dy = x(j,1) - ytmp;
     const F_FLOAT dz = x(j,2) - ztmp;
-    */
-    // From compute sna/grid/kk :
-    /*
-    const double delx = xtmp - x[j][0];
-    const double dely = ytmp - x[j][1];
-    const double delz = ztmp - x[j][2];
-    */
+
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // don't include atoms that share location with grid point
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    } 
+
+    type_cache[j] = jtype;
+
+    if (jtype >= 0)
+      ninside++;
+
+  }
+  
+
+  /*
+  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
+    [&] (const int j, int& count) {
     const F_FLOAT dx = x(j,0) - xtmp;
     const F_FLOAT dy = x(j,1) - ytmp;
     const F_FLOAT dz = x(j,2) - ztmp;
@@ -446,10 +458,45 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
      count++;
 
   }, ninside);
+  */
+  
+
+  //printf("ninside: %d\n", ninside);
 
   d_ninside(ii) = ninside; 
 
   // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    const int jtype = type_cache[j];
+    if (jtype >= 0) {
+      const F_FLOAT dx = x(j,0) - xtmp;
+      const F_FLOAT dy = x(j,1) - ytmp;
+      const F_FLOAT dz = x(j,2) - ztmp;
+      int jtype = type(j);
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      my_sna.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        my_sna.element(ii,offset) = jelem;
+      else
+        my_sna.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+
+  /*
   Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
     [&] (const int j, int& offset, bool final) {
 
@@ -483,6 +530,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
       offset++;
     }
   });
+  */
 }
 
 

From 1037e4a4eb672e914df45d718d0d2973dc7b03ff Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Tue, 30 May 2023 22:40:12 -0600
Subject: [PATCH 18/51] Use normal loop over ntotal inside neighbor team policy

---
 src/KOKKOS/compute_sna_grid_kokkos.h      |   2 +-
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 100 ++++++++++++++++------
 src/KOKKOS/memory_kokkos.h                |   2 +
 src/KOKKOS/pair_mliap_kokkos.cpp          |   2 +
 4 files changed, 81 insertions(+), 25 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index 830601c0fb..0f56fdcbf1 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -220,7 +220,7 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
 
   SNAKokkos<DeviceType, real_type, vector_length> snaKK;
 
-  int chunk_size, chunk_offset;
+  int max_neighs, chunk_size, chunk_offset;
   int host_flag;
   int ntotal;
   int total_range; // total number of loop iterations in grid
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index db6245ec34..9e704954f1 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -166,11 +166,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
   ComputeGrid::set_grid_local();
   
   // allocate arrays
-  printf(">>> Allocating gridall.\n");
-  printf(">>> %d %d\n", size_array_rows, size_array_cols);
+  //printf(">>> Allocating gridall.\n");
+  //printf(">>> %d %d\n", size_array_rows, size_array_cols);
   //memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
   memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
-  printf(">>> Allocated gridall.\n");
+  //printf(">>> Allocated gridall.\n");
 
   // do not use or allocate gridlocal for now
 
@@ -209,6 +209,9 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   x = atomKK->k_x.view<DeviceType>();
   type = atomKK->k_type.view<DeviceType>();
   k_cutsq.template sync<DeviceType>();
+  //printf(">>> max neighs\n");
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
 
   // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total 
   // number of atoms.
@@ -216,32 +219,37 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   ntotal = atomKK->nlocal + atomKK->nghost;
   // Allocate view for number of neighbors per grid point
   MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
-
+  //printf(">>> chunk_size\n");
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
   chunk_size = MIN(chunksize, total_range);
-  snaKK.grow_rij(chunk_size, ntotal);
-  //snaKK.grow_rij(chunk_size, max_neighs);
+  //snaKK.grow_rij(chunk_size, ntotal);
+  snaKK.grow_rij(chunk_size, max_neighs);
 
   //chunk_size = total_range;
  
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
+  //printf(">>> Begin computeneigh block\n");
   //ComputeNeigh 
   {
-    int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * ntotal);
+    int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
 
     SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh> 
       policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
     policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+    //printf(">>>> blah\n");
     Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    //printf(">>>> foo\n");
   }
 
+  //printf(">>>>> Ended compute neigh\n");
+
   //ComputeCayleyKlein
   {
     // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
     Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
-      policy_compute_ck({0,0,0}, {vector_length, ntotal, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
     Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
   }
 
@@ -265,7 +273,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
       // Version with parallelism over j_bend
 
       // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
-      const int n_teams = chunk_size_div * ntotal * (twojmax + 1);
+      const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
       const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
 
       SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
@@ -276,7 +284,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
       // Version w/out parallelism  over j_bend
 
       // total number of teams needed: (natoms / 32) * (ntotal)
-      const int n_teams = chunk_size_div * ntotal;
+      const int n_teams = chunk_size_div * max_neighs;
       const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
 
       SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
@@ -353,7 +361,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   //       natoms = max team size).
 
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
-
+  //printf(">>> Begin computeneigh\n");
   // basic quantities associated with this team:
   // team_rank : rank of thread in this team
   // league_rank : rank of team in this league
@@ -367,11 +375,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   // This is used to cache whether or not an atom is within the cutoff.
   // If it is, type_cache is assigned to the atom type.
   // If it's not, it's assigned to -1.
-  const int tile_size = ntotal; // number of elements per thread
+  const int tile_size = ntotal; //max_neighs; // number of elements per thread
   const int team_rank = team.team_rank();
   const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
   int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
-
+  //printf(">>> Convert to grid indices\n");
   // convert to grid indices
 
   int iz = ii/(xlen*ylen);
@@ -415,12 +423,13 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   // Compute the number of neighbors, store rsq
   int ninside = 0;
   
-  // want to loop over ntotal... keep getting seg fault when accessing type_cache[j]?
+  //printf(">>> Looping over ntotal\n");
+  // Looping over ntotal for now.
   for (int j = 0; j < ntotal; j++){
     const F_FLOAT dx = x(j,0) - xtmp;
     const F_FLOAT dy = x(j,1) - ytmp;
     const F_FLOAT dz = x(j,2) - ztmp;
-
+    //printf(">>> jtype\n");
     int jtype = type(j);
     const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
 
@@ -428,14 +437,17 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
       jtype = -1; // use -1 to signal it's outside the radius
     } 
-
-    type_cache[j] = jtype;
+    //printf(">>> accessing type cache\n");
+    //type_cache[j] = jtype;
 
     if (jtype >= 0)
       ninside++;
 
+    //printf(">>> after type cache\n");
+
   }
-  
+
+  //printf(">>> after first loop\n");  
 
   /*
   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
@@ -467,9 +479,46 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
   int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    //const int jtype = type_cache[j];
+    //if (jtype >= 0) {
+    //printf(">>> offset: %d\n", offset);
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    int jtype = type(j);
+    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      my_sna.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        my_sna.element(ii,offset) = jelem;
+      else
+        my_sna.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+
+  //printf(">>> end inside\n");
+
+  /*
+  int offset = 0;
   for (int j = 0; j < ntotal; j++){
     const int jtype = type_cache[j];
     if (jtype >= 0) {
+      printf(">>> offset: %d\n", offset);
       const F_FLOAT dx = x(j,0) - xtmp;
       const F_FLOAT dy = x(j,1) - ytmp;
       const F_FLOAT dz = x(j,2) - ztmp;
@@ -495,6 +544,9 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
       offset++;
     }
   }
+  */
+
+  //printf(">>> End of computeneigh\n");
 
   /*
   Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
@@ -572,10 +624,10 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
 
   // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / (ntotal * (twojmax + 1)); // removed "const" to work around GCC 7 bug
-  const int jj_jbend = flattened_idx - iatom_div * (ntotal * (twojmax + 1));
-  const int jbend = jj_jbend / ntotal;
-  int jj = jj_jbend - jbend * ntotal; // removed "const" to work around GCC 7 bug
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
 
   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
     [&] (const int iatom_mod) {
@@ -599,8 +651,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
 
   // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / ntotal; // removed "const" to work around GCC 7 bug
-  int jj = flattened_idx - iatom_div * ntotal;
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
 
   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
     [&] (const int iatom_mod) {
diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h
index 35a7ceaeb4..623b002dcb 100644
--- a/src/KOKKOS/memory_kokkos.h
+++ b/src/KOKKOS/memory_kokkos.h
@@ -163,6 +163,7 @@ template <typename TYPE, typename HTYPE>
 {
   data = TYPE(std::string(name),n1,n2);
   h_data = Kokkos::create_mirror_view(data);
+  printf(">>> name: %s\n", name);
   return data;
 }
 
@@ -173,6 +174,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
   data = TYPE(std::string(name),n1,n2);
   bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
   array = (typename TYPE::value_type **) smalloc(nbytes,name);
+  printf(">>> name %s nbytes %d\n", name, nbytes);
 
   for (int i = 0; i < n1; i++) {
     if (n2 == 0)
diff --git a/src/KOKKOS/pair_mliap_kokkos.cpp b/src/KOKKOS/pair_mliap_kokkos.cpp
index d19d81e314..18f5368a98 100644
--- a/src/KOKKOS/pair_mliap_kokkos.cpp
+++ b/src/KOKKOS/pair_mliap_kokkos.cpp
@@ -232,6 +232,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
   // map[i] = which element the Ith atom type is, -1 if not mapped
   // map[0] is not used
 
+  //printf(">>> ntypes: %d\n", atom->ntypes);
   for (int i = 1; i <= atom->ntypes; i++) {
     char* elemname = elemtypes[i-1];
     int jelem;
@@ -239,6 +240,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
       if (strcmp(elemname,descriptor->elements[jelem]) == 0)
         break;
 
+    //printf(">>> nelements: %d\n", descriptor->nelements);
     if (jelem < descriptor->nelements)
       map[i] = jelem;
     else if (strcmp(elemname,"NULL") == 0) map[i] = -1;

From 95e39ba89a93e9c0fdcdf5213b7bdfe6934a6f8a Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Tue, 30 May 2023 22:53:24 -0600
Subject: [PATCH 19/51] Clean up kernels

---
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 33 ++++-------------------
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 9e704954f1..d6984fbdb1 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -134,7 +134,6 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokko
 {
   if (copymode) return;
 
-  printf("^^^ ComputeSNAGridKokkos destructor begin destroy\n");
   memoryKK->destroy_kokkos(k_cutsq,cutsq);
   //memoryKK->destroy_kokkos(k_grid,grid);
   memoryKK->destroy_kokkos(k_gridall, gridall);
@@ -209,7 +208,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   x = atomKK->k_x.view<DeviceType>();
   type = atomKK->k_type.view<DeviceType>();
   k_cutsq.template sync<DeviceType>();
-  //printf(">>> max neighs\n");
+
   // max_neighs is defined here - think of more elaborate methods.
   max_neighs = 100;
 
@@ -219,7 +218,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   ntotal = atomKK->nlocal + atomKK->nghost;
   // Allocate view for number of neighbors per grid point
   MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
-  //printf(">>> chunk_size\n");
+
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
   chunk_size = MIN(chunksize, total_range);
   //snaKK.grow_rij(chunk_size, ntotal);
@@ -230,7 +229,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
-  //printf(">>> Begin computeneigh block\n");
   //ComputeNeigh 
   {
     int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
@@ -238,13 +236,9 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
     SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh> 
       policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
     policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-    //printf(">>>> blah\n");
     Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
-    //printf(">>>> foo\n");
   }
 
-  //printf(">>>>> Ended compute neigh\n");
-
   //ComputeCayleyKlein
   {
     // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
@@ -361,7 +355,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   //       natoms = max team size).
 
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
-  //printf(">>> Begin computeneigh\n");
+
   // basic quantities associated with this team:
   // team_rank : rank of thread in this team
   // league_rank : rank of team in this league
@@ -379,7 +373,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const int team_rank = team.team_rank();
   const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
   int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
-  //printf(">>> Convert to grid indices\n");
+
   // convert to grid indices
 
   int iz = ii/(xlen*ylen);
@@ -394,7 +388,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   // index ii already captures the proper grid point
   // int igrid = iz * (nx * ny) + iy * nx + ix;
-  // printf("ii igrid: %d %d\n", ii, igrid);
 
   // grid2x converts igrid to ix,iy,iz like we've done before
   // multiply grid integers by grid spacing delx, dely, delz
@@ -423,13 +416,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   // Compute the number of neighbors, store rsq
   int ninside = 0;
   
-  //printf(">>> Looping over ntotal\n");
   // Looping over ntotal for now.
   for (int j = 0; j < ntotal; j++){
     const F_FLOAT dx = x(j,0) - xtmp;
     const F_FLOAT dy = x(j,1) - ytmp;
     const F_FLOAT dz = x(j,2) - ztmp;
-    //printf(">>> jtype\n");
     int jtype = type(j);
     const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
 
@@ -437,17 +428,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
       jtype = -1; // use -1 to signal it's outside the radius
     } 
-    //printf(">>> accessing type cache\n");
-    //type_cache[j] = jtype;
 
     if (jtype >= 0)
       ninside++;
 
-    //printf(">>> after type cache\n");
-
-  }
-
-  //printf(">>> after first loop\n");  
+  } 
 
   /*
   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
@@ -471,9 +456,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   }, ninside);
   */
-  
-
-  //printf("ninside: %d\n", ninside);
 
   d_ninside(ii) = ninside; 
 
@@ -482,7 +464,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   for (int j = 0; j < ntotal; j++){
     //const int jtype = type_cache[j];
     //if (jtype >= 0) {
-    //printf(">>> offset: %d\n", offset);
     const F_FLOAT dx = x(j,0) - xtmp;
     const F_FLOAT dy = x(j,1) - ytmp;
     const F_FLOAT dz = x(j,2) - ztmp;
@@ -511,8 +492,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     }
   }
 
-  //printf(">>> end inside\n");
-
   /*
   int offset = 0;
   for (int j = 0; j < ntotal; j++){
@@ -546,8 +525,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   }
   */
 
-  //printf(">>> End of computeneigh\n");
-
   /*
   Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
     [&] (const int j, int& offset, bool final) {

From be5476e442dd66e1854bcb011de3488f9419e8fd Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Fri, 2 Jun 2023 15:10:45 -0600
Subject: [PATCH 20/51] Loop over chunks on GPU to write values properly when
 using default chunk size

---
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 206 ++++++++++++----------
 src/KOKKOS/memory_kokkos.h                |   4 +-
 src/ML-SNAP/compute_grid.cpp              |   3 +
 3 files changed, 120 insertions(+), 93 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index d6984fbdb1..cb0a8a646f 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -220,7 +220,10 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
 
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  //printf(">>> total_range: %d\n", total_range);
   chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
   //snaKK.grow_rij(chunk_size, ntotal);
   snaKK.grow_rij(chunk_size, max_neighs);
 
@@ -229,100 +232,112 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
-  //ComputeNeigh 
-  {
-    int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
 
-    SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh> 
-      policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
-    policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-    Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
-  }
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
 
-  //ComputeCayleyKlein
-  {
-    // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
-    Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
-      policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
-    Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
-  }
+    //printf(">>> chunk_offset: %d\n", chunk_offset);
 
-  //PreUi
-  {
-    // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
-    Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
-      policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
-    Kokkos::parallel_for("PreUi",policy_preui,*this);
-  }
-
-  // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
-  {
-    // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
-    // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
-    const int tile_size = vector_length * (twojmax + 1);
-    const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
-
-    if (chunk_size < parallel_thresh)
+    //ComputeNeigh 
     {
-      // Version with parallelism over j_bend
+      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
 
-      // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
-      const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
-      const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
-
-      SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
-        policy_ui(n_teams_div, team_size_compute_ui, vector_length);
-      policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-      Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
-    } else {
-      // Version w/out parallelism  over j_bend
-
-      // total number of teams needed: (natoms / 32) * (ntotal)
-      const int n_teams = chunk_size_div * max_neighs;
-      const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
-
-      SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
-        policy_ui(n_teams_div, team_size_compute_ui, vector_length);
-      policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-      Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh> 
+        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
     }
-  }
 
-  //TransformUi: un-"fold" ulisttot, zero ylist
-  {
-    // team_size_transform_ui is defined in `pair_snap_kokkos.h`
-    Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
-        policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
-    Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
-  }
+    //ComputeCayleyKlein
+    {
+      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
+        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+    }
 
-  //Compute bispectrum in AoSoA data layout, transform Bi
+    //PreUi
+    {
+      // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
+        policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
+      Kokkos::parallel_for("PreUi",policy_preui,*this);
+    }
 
-  //ComputeZi
-  const int idxz_max = snaKK.idxz_max;
-  Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
-      policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
-  Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
+    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+    {
+      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
+      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+      const int tile_size = vector_length * (twojmax + 1);
+      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
 
-  //ComputeBi
-  const int idxb_max = snaKK.idxb_max;
-  Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
-      policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
-  Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
+      if (chunk_size < parallel_thresh)
+      {
+        // Version with parallelism over j_bend
 
-  //Transform data layout of blist out of AoSoA
-  //We need this because `blist` gets used in ComputeForce which doesn't
-  //take advantage of AoSoA, which at best would only be beneficial on the margins
-  //NOTE: Do we need this in compute sna/grid/kk?
-  Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridTransformBi>
-      policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
-  Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+        // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
+        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
 
-  // Fill the grid array with bispectrum values
-  {
-    typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
-    Kokkos::parallel_for(policy_fill, *this);
-  }
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+      } else {
+        // Version w/out parallelism  over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal)
+        const int n_teams = chunk_size_div * max_neighs;
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      }
+    }
+
+    //TransformUi: un-"fold" ulisttot, zero ylist
+    {
+      // team_size_transform_ui is defined in `pair_snap_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
+          policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
+      Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
+    }
+
+    //Compute bispectrum in AoSoA data layout, transform Bi
+
+    //ComputeZi
+    const int idxz_max = snaKK.idxz_max;
+    Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
+        policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
+    Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
+
+    //ComputeBi
+    const int idxb_max = snaKK.idxb_max;
+    Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
+        policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
+    Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
+
+    //Transform data layout of blist out of AoSoA
+    //We need this because `blist` gets used in ComputeForce which doesn't
+    //take advantage of AoSoA, which at best would only be beneficial on the margins
+    //NOTE: Do we need this in compute sna/grid/kk?
+    Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridTransformBi>
+        policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
+    Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+
+    // Fill the grid array with bispectrum values
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
+      Kokkos::parallel_for(policy_fill, *this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+
+  } // end while
 
   k_gridlocal.template modify<DeviceType>();
   k_gridlocal.template sync<LMPHostType>();
@@ -363,8 +378,12 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   // extract loop index
   int ii = team.team_rank() + team.league_rank() * team.team_size();
+
   if (ii >= chunk_size) return;
 
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
   // get a pointer to scratch memory
   // This is used to cache whether or not an atom is within the cutoff.
   // If it is, type_cache is assigned to the atom type.
@@ -376,8 +395,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   // convert to grid indices
 
-  int iz = ii/(xlen*ylen);
-  int i2 = ii - (iz*xlen*ylen);
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
   int iy = i2/xlen;
   int ix = i2 % xlen;
   iz += nzlo;
@@ -387,7 +406,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   double xgrid[3];
 
   // index ii already captures the proper grid point
-  // int igrid = iz * (nx * ny) + iy * nx + ix;
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+  //printf("%d %d\n", ii, igrid);
 
   // grid2x converts igrid to ix,iy,iz like we've done before
   // multiply grid integers by grid spacing delx, dely, delz
@@ -729,10 +749,14 @@ KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
   // convert to grid indices
 
-  int iz = ii/(xlen*ylen);
-  int i2 = ii - (iz*xlen*ylen);
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
   int iy = i2/xlen;
   int ix = i2 % xlen;
   iz += nzlo;
@@ -753,9 +777,9 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const F_FLOAT xtmp = xgrid[0];
   const F_FLOAT ytmp = xgrid[1];
   const F_FLOAT ztmp = xgrid[2];
-  d_gridall(ii,0) = xtmp;
-  d_gridall(ii,1) = ytmp;
-  d_gridall(ii,2) = ztmp;
+  d_gridall(igrid,0) = xtmp;
+  d_gridall(igrid,1) = ytmp;
+  d_gridall(igrid,2) = ztmp;
 
   const auto idxb_max = snaKK.idxb_max;
 
@@ -764,7 +788,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
     const auto idxb = icoeff % idxb_max;
     const auto idx_chem = icoeff / idxb_max;
-    d_gridall(ii,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
+    d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
   }
 
 }
diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h
index 623b002dcb..e40edac607 100644
--- a/src/KOKKOS/memory_kokkos.h
+++ b/src/KOKKOS/memory_kokkos.h
@@ -163,7 +163,7 @@ template <typename TYPE, typename HTYPE>
 {
   data = TYPE(std::string(name),n1,n2);
   h_data = Kokkos::create_mirror_view(data);
-  printf(">>> name: %s\n", name);
+  //printf(">>> name: %s\n", name);
   return data;
 }
 
@@ -174,7 +174,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
   data = TYPE(std::string(name),n1,n2);
   bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
   array = (typename TYPE::value_type **) smalloc(nbytes,name);
-  printf(">>> name %s nbytes %d\n", name, nbytes);
+  //printf(">>> name %s nbytes %d\n", name, nbytes);
 
   for (int i = 0; i < n1; i++) {
     if (n2 == 0)
diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp
index 12135c705d..dce2ab0283 100644
--- a/src/ML-SNAP/compute_grid.cpp
+++ b/src/ML-SNAP/compute_grid.cpp
@@ -88,6 +88,7 @@ void ComputeGrid::grid2x(int igrid, double *x)
   x[2] = iz * delz;
 
   if (triclinic) domain->lamda2x(x, x);
+  //printf(">>>>> ComputeGrid::grid2x\n");
 }
 
 /* ----------------------------------------------------------------------
@@ -103,6 +104,7 @@ void ComputeGrid::assign_coords_all()
     gridall[igrid][1] = x[1];
     gridall[igrid][2] = x[2];
   }
+  //printf(">>>>> ComputeGrid::assign_coords_all\n");
 }
 
 /* ----------------------------------------------------------------------
@@ -111,6 +113,7 @@ void ComputeGrid::assign_coords_all()
 
 void ComputeGrid::allocate()
 {
+  //printf(">>> ComputeGrid::allocate\n");
   // allocate arrays
   memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
   memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");

From b1ffcbcd4190ccd26b10ab2726aac083fe404740 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Fri, 2 Jun 2023 17:38:48 -0600
Subject: [PATCH 21/51] Fix cutoff factor when switchflag = 0

---
 src/KOKKOS/sna_kokkos_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 258fcb97a8..3bc241825b 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -2296,7 +2296,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_s_dsfac(const real
   constexpr real_type zero = static_cast<real_type>(0.0);
   constexpr real_type onehalf = static_cast<real_type>(0.5);
 
-  if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; }
+  if (switch_flag == 0) { sfac_outer = one; dsfac_outer = zero; }
   else if (switch_flag == 1) {
     if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; }
     else if (r > rcut) { sfac = zero; dsfac = zero; return; }

From b1105a231baccc1e4ac7092747c051cefd71d4eb Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sun, 4 Jun 2023 05:03:06 -0600
Subject: [PATCH 22/51] Add triclinic cell conversion

---
 src/KOKKOS/compute_sna_grid_kokkos.h      | 11 ++++
 src/KOKKOS/compute_sna_grid_kokkos_impl.h | 63 ++++++++++++++++++++++-
 2 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index 0f56fdcbf1..fa0c7f18dd 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -269,6 +269,17 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   template <typename scratch_type>
   int scratch_size_helper(int values_per_team);
 
+  class DomainKokkos *domainKK;
+
+  // triclinic vars
+  /*
+  xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+  xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+  xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+  */
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+
 };
 
 // These wrapper classes exist to make the compute style factory happy/avoid having
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index cb0a8a646f..6dc3be90d4 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -28,6 +28,8 @@
 #include "neigh_request.h"
 #include "neighbor_kokkos.h"
 //#include "sna_kokkos.h"
+#include "domain.h"
+#include "domain_kokkos.h"
 #include "sna.h"
 #include "update.h"
 
@@ -49,6 +51,7 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
 {
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
@@ -232,6 +235,23 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
+  if (triclinic){
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];   
+    h5 = domain->h[5];   
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
   while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
 
     if (chunk_size > total_range - chunk_offset)
@@ -415,6 +435,26 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   xgrid[0] = ix * delx;
   xgrid[1] = iy * dely;
   xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
   const F_FLOAT xtmp = xgrid[0];
   const F_FLOAT ytmp = xgrid[1];
   const F_FLOAT ztmp = xgrid[2];
@@ -429,9 +469,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
   // The purpose here is to transform for triclinic boxes.
+  /*
   if (triclinic){
     printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
-  } 
+  }
+  */
 
   // Compute the number of neighbors, store rsq
   int ninside = 0;
@@ -774,6 +816,25 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   xgrid[0] = ix * delx;
   xgrid[1] = iy * dely;
   xgrid[2] = iz * delz;
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
   const F_FLOAT xtmp = xgrid[0];
   const F_FLOAT ytmp = xgrid[1];
   const F_FLOAT ztmp = xgrid[2];

From 788fd3a9ac1726f892069aa620b4fcdd12fd5b60 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Tue, 6 Jun 2023 15:28:41 +0200
Subject: [PATCH 23/51] Re-Adding Gaussian grid again, originally authored by
 Aidan Thompson

Co-authored-by: Aidan Thompson <athomps@sandia.gov>
---
 src/ML-SNAP/compute_gaussian_grid_local.cpp | 167 ++++++++++++++++++++
 src/ML-SNAP/compute_gaussian_grid_local.h   |  51 ++++++
 2 files changed, 218 insertions(+)
 create mode 100644 src/ML-SNAP/compute_gaussian_grid_local.cpp
 create mode 100644 src/ML-SNAP/compute_gaussian_grid_local.h

diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp
new file mode 100644
index 0000000000..ec75563bcf
--- /dev/null
+++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp
@@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_gaussian_grid_local.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "modify.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using MathConst::MY_2PI;
+using MathSpecial::powint;
+
+ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char **arg) :
+    ComputeGridLocal(lmp, narg, arg), cutsq(nullptr), radelem(nullptr),
+    sigmaelem(nullptr), prefacelem(nullptr), argfacelem(nullptr)
+{
+  // skip over arguments used by base class so that argument positions
+  // are identical to regular per-atom compute: arg[3] is rcutfac, followed
+  // by one radelem value and then one sigmaelem value per atom type
+
+  arg += nargbase;
+  narg -= nargbase;
+
+  const int ntypes = atom->ntypes;
+  const int nargmin = 4 + 2 * ntypes;
+
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);
+
+  // process required arguments
+
+  memory->create(radelem, ntypes + 1, "gaussian/atom:radelem");    // offset by 1 to match up with types
+  memory->create(sigmaelem, ntypes + 1, "gaussian/atom:sigmaelem");
+  memory->create(prefacelem, ntypes + 1, "gaussian/atom:prefacelem");
+  memory->create(argfacelem, ntypes + 1, "gaussian/atom:argfacelem");
+
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+
+  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp);
+  for (int i = 0; i < ntypes; i++)
+    sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp);
+
+  // sigma must be positive: zero divides by zero below, negative flips the sign of the density
+  for (int i = 1; i <= ntypes; i++)
+    if (sigmaelem[i] <= 0.0) error->all(FLERR, "Illegal compute {} command", style);
+
+  // construct cutsq; cutmax tracks the largest same-type cutoff
+
+  cutmax = 0.0;
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "gaussian/atom:cutsq");
+  for (int i = 1; i <= ntypes; i++) {
+    const double cutself = 2.0 * radelem[i] * rcutfac;
+    if (cutself > cutmax) cutmax = cutself;
+    cutsq[i][i] = cutself * cutself;
+    for (int j = i + 1; j <= ntypes; j++) {
+      const double cutpair = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cutpair * cutpair;
+    }
+  }
+
+  size_local_cols = size_local_cols_base + ntypes;
+
+  // pre-compute coefficients: normalization 1/(sigma*sqrt(2*pi))^3 and 1/(2*sigma^2)
+
+  for (int i = 0; i < ntypes; i++) {
+    prefacelem[i + 1] = 1.0/powint(sigmaelem[i + 1] * sqrt(MY_2PI), 3);
+    argfacelem[i + 1] = 1.0/(2.0 * sigmaelem[i + 1] * sigmaelem[i + 1]);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeGaussianGridLocal::~ComputeGaussianGridLocal()
+{
+  memory->destroy(cutsq);    // release everything the constructor allocated
+  memory->destroy(argfacelem);
+  memory->destroy(prefacelem);
+  memory->destroy(sigmaelem);
+  memory->destroy(radelem);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGaussianGridLocal::init()
+{
+  const bool multiple = modify->get_compute_by_style("^gaussian/grid/local$").size() > 1;
+  if (multiple && (comm->me == 0)) error->warning(FLERR, "More than one instance of compute gaussian/grid/local");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGaussianGridLocal::compute_local()
+{
+  invoked_local = update->ntimestep;
+
+  // for each local grid point, sum the Gaussians of all group atoms
+  // (owned and ghost) into one column per atom type
+
+  double **const pos = atom->x;
+  const int *const grpmask = atom->mask;
+  const int *const atype = atom->type;
+  const int nall = atom->nlocal + atom->nghost;
+
+  int irow = 0;
+  for (int iz = nzlo; iz <= nzhi; iz++) {
+    for (int iy = nylo; iy <= nyhi; iy++) {
+      for (int ix = nxlo; ix <= nxhi; ix++) {
+        double xgrid[3];
+        grid2x(ix, iy, iz, xgrid);
+
+        // the per-type columns are accumulated below, so reset them first
+        for (int icol = size_local_cols_base; icol < size_local_cols; icol++)
+          alocal[irow][icol] = 0.0;
+
+        for (int j = 0; j < nall; j++) {
+
+          // only atoms in the compute group contribute
+          if ((grpmask[j] & groupbit) == 0) continue;
+
+          const double dx = xgrid[0] - pos[j][0];
+          const double dy = xgrid[1] - pos[j][1];
+          const double dz = xgrid[2] - pos[j][2];
+          const double rsq = dx * dx + dy * dy + dz * dz;
+
+          // each atom is truncated at the same-type cutoff of its own type
+          const int jtype = atype[j];
+          if (rsq < cutsq[jtype][jtype]) {
+            const int icol = size_local_cols_base + jtype - 1;
+            alocal[irow][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]);
+          }
+        }
+        irow++;
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   memory usage
+------------------------------------------------------------------------- */
+
+double ComputeGaussianGridLocal::memory_usage()
+{
+  // radelem, sigmaelem, prefacelem and argfacelem hold n doubles each, cutsq holds n*n
+  // (any row-pointer overhead of the 2d cutsq allocation is not counted)
+  const double n = atom->ntypes + 1;
+  return (4.0 * n + n * n) * sizeof(double);
+}
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h b/src/ML-SNAP/compute_gaussian_grid_local.h
new file mode 100644
index 0000000000..cfab841a6e
--- /dev/null
+++ b/src/ML-SNAP/compute_gaussian_grid_local.h
@@ -0,0 +1,51 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(gaussian/grid/local,ComputeGaussianGridLocal);
+// clang-format on
+#else
+
+#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
+#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H
+
+#include "compute_grid_local.h"
+
+namespace LAMMPS_NS {
+
+class ComputeGaussianGridLocal : public ComputeGridLocal {
+ public:
+  ComputeGaussianGridLocal(class LAMMPS *, int, char **);    // parses rcutfac, radii and sigmas
+  ~ComputeGaussianGridLocal() override;
+  void init() override;             // warns when more than one instance is defined
+  void compute_local() override;    // fills one Gaussian-sum column per atom type in alocal
+  double memory_usage() override;
+
+ private:
+  int ncoeff;         // NOTE(review): not referenced by the .cpp added in this patch - confirm it is needed
+  double **cutsq;     // squared cut-off per type pair, indices start at 1
+  double rcutfac;     // global cut-off scale
+  double *radelem;    // cut-off radius of each atom type
+  double *sigmaelem;  // Gaussian width of each atom type
+  double *prefacelem; // Gaussian prefactor of each atom type
+  double *argfacelem; // Gaussian argument factor of each atom type
+  int *map;    // map types to [0,nelements); NOTE(review): never allocated by the .cpp in this patch
+  int nelements;      // NOTE(review): never assigned by the .cpp in this patch - do not read before setting
+  double cutmax;      // largest same-type cut-off (2 * radelem * rcutfac)
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif

From fc5e583c56c61fc122d4782f9ddfb88da4109931 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sat, 24 Jun 2023 15:56:54 -0600
Subject: [PATCH 24/51] Initial Gaussian grid implementation

---
 .../compute_gaussian_grid_local_kokkos.cpp    | 85 +++++++++++++++++++
 .../compute_gaussian_grid_local_kokkos.h      | 75 ++++++++++++++++
 src/KOKKOS/compute_sna_grid_kokkos.h          |  8 --
 src/KOKKOS/compute_sna_grid_kokkos_impl.h     | 11 ---
 4 files changed, 160 insertions(+), 19 deletions(-)
 create mode 100644 src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
 create mode 100644 src/KOKKOS/compute_gaussian_grid_local_kokkos.h

diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
new file mode 100644
index 0000000000..240767e43a
--- /dev/null
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -0,0 +1,85 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Drew Rohskopf (SNL)
+------------------------------------------------------------------------- */
+
+#include "compute_gaussian_grid_local_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "pair.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeGaussianGridLocal(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeGaussianGridLocalKokkos<DeviceType>::~ComputeGaussianGridLocalKokkos()
+{
+  if (copymode) return;
+
+  //memoryKK->destroy_kokkos(k_result,result);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::init()
+{
+  ComputeGaussianGridLocal::init();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
+{
+
+  printf(">>> compute_local Kokkos\n");
+
+}
+
+namespace LAMMPS_NS {
+template class ComputeGaussianGridLocalKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeGaussianGridLocalKokkos<LMPHostType>;
+#endif
+}
\ No newline at end of file
diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
new file mode 100644
index 0000000000..7698ce9567
--- /dev/null
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
@@ -0,0 +1,75 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(gaussian/grid/local/kk,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
+ComputeStyle(gaussian/grid/local/kk/device,ComputeGaussianGridLocalKokkos<LMPDeviceType>);
+ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos<LMPHostType>);
+// clang-format on
+
+#else
+
+#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
+#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H
+
+#include "compute_gaussian_grid_local.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+// clang-format off
+//struct TagComputeGaussianGridLocal {};
+// clang-format on
+
+template <class DeviceType> class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **);
+  ~ComputeGaussianGridLocalKokkos() override;
+  void init() override;
+  void compute_local() override;
+
+  //KOKKOS_INLINE_FUNCTION
+  //void operator()(TagComputeGaussianGridLocal const int &) const;
+
+ private:
+  //double adof, mvv2e, mv2d, boltz;
+
+  Kokkos::View<double*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<int*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<int*, DeviceType> d_map;                    // mapping from atom types to elements
+
+  /*
+  typename AT::t_x_array x;
+  typename AT::t_v_array v;
+  typename ArrayTypes<DeviceType>::t_float_1d rmass;
+  typename ArrayTypes<DeviceType>::t_float_1d mass;
+  typename ArrayTypes<DeviceType>::t_int_1d type;
+  typename ArrayTypes<DeviceType>::t_int_1d mask;
+  */
+
+  //typename AT::t_neighbors_2d d_neighbors;
+  //typename AT::t_int_1d d_ilist;
+  //typename AT::t_int_1d d_numneigh;
+
+  //DAT::tdual_float_2d k_result;
+  //typename AT::t_float_2d d_result;
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index fa0c7f18dd..bd47059312 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -252,14 +252,6 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   typename AT::t_float_2d d_grid;
   typename AT::t_float_2d d_gridall;
 
-  //DAT::tdual_float_4d k_gridlocal;
-  //typedef Kokkos::DualView<real_type****, Kokkos::LayoutLeft, DeviceType> t_gridlocal_4d;
-  //typedef Kokkos::View<real_type****, DeviceType> t_4d;
-  // should we use LMPDeviceType below?
-  //typedef Kokkos::DualView<LMP_FLOAT****, LMPDeviceType> tdual_float_4d;
-  //typedef tdual_float_4d::t_dev tdev_float_4d;
-  //tdual_float_4d k_gridlocal;
-  //tdev_float_4d d_gridlocal; 
   DAT::tdual_float_4d k_gridlocal;
   typename AT::t_float_4d d_gridlocal;
 
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 6dc3be90d4..bd95c6a62c 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -168,22 +168,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
   ComputeGrid::set_grid_local();
   
   // allocate arrays
-  //printf(">>> Allocating gridall.\n");
-  //printf(">>> %d %d\n", size_array_rows, size_array_cols);
-  //memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
   memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
-  //printf(">>> Allocated gridall.\n");
 
   // do not use or allocate gridlocal for now
 
   gridlocal_allocated = 0;
-  /*
-  if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
-    gridlocal_allocated = 1;
-    memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo, 
-                                     nyhi, nxlo, nxhi, "grid:gridlocal");
-  }
-  */
   array = gridall;
 
   d_gridlocal = k_gridlocal.template view<DeviceType>();

From 5885f49b751452f3a1009517dfb808f5ce493f0a Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Mon, 26 Jun 2023 14:50:44 -0600
Subject: [PATCH 25/51] Prevent polymorphic destructor calls with copymode

---
 .../compute_gaussian_grid_local_kokkos.cpp    | 167 +++++++++++++++++-
 .../compute_gaussian_grid_local_kokkos.h      |  37 +++-
 src/KOKKOS/compute_sna_grid_kokkos_impl.h     |   2 +
 3 files changed, 201 insertions(+), 5 deletions(-)

diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
index 240767e43a..e7da2a315a 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -47,6 +47,49 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = EMPTY_MASK;
   datamask_modify = EMPTY_MASK;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
+
+  //cutsq_tmp = cutsq[1][1];
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j]; //cutsq_tmp;
+      k_cutsq.template modify<LMPHostType>();
+    }
+  }
+  //printf(">>> 1\n");
+  // Set up element lists
+  // d_radelem needs one slot per atom type: the fill loop below writes entries 0..ntypes-1
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",n);
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  //printf(">>> 3\n");
+  // radelem is indexed by type starting at 1, h_radelem starts at 0
+  for (int i = 1; i <= n; i++) {
+    h_radelem(i-1) = radelem[i];
+  }
+  //printf(">>> 4\n");
+  // In pair snap some things like `map` get allocated regardless of chem flag.
+  // In this compute, however, map does not get allocated in parent classes.
+  /*
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_map(i) = map[i];
+  }
+  */
+  //printf(">>> 5\n");
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_map,h_map);
+  //printf(">>> 6\n");
+
 }
 
 /* ---------------------------------------------------------------------- */
@@ -54,9 +97,40 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
 template<class DeviceType>
 ComputeGaussianGridLocalKokkos<DeviceType>::~ComputeGaussianGridLocalKokkos()
 {
+  printf(">>> ComputeGaussianGridLocalKokkos destruct begin, copymode %d\n", copymode);
   if (copymode) return;
 
-  //memoryKK->destroy_kokkos(k_result,result);
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_alocal,alocal);
+  //gridlocal_allocated = 0;
+
+  printf(">>> ComputeGaussianGridLocalKokkos end\n");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
+{
+
+  // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
+  // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
+
+  //ComputeGrid::set_grid_global();
+  //ComputeGrid::set_grid_local();
+  ComputeGridLocal::setup();
+
+  // allocate arrays
+  printf(">>> rows cols kokkos init: %d %d\n", size_local_rows, size_local_cols);
+  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
+
+  //gridlocal_allocated = 1;
+  //array = gridall;
+
+  d_alocal = k_alocal.template view<DeviceType>();
+  //d_grid = k_grid.template view<DeviceType>();
+  //d_gridall = k_gridall.template view<DeviceType>();
+
 }
 
 /* ---------------------------------------------------------------------- */
@@ -72,9 +146,98 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::init()
 template<class DeviceType>
 void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
 {
+  printf(">>> compute_local Kokkos begin\n");
 
-  printf(">>> compute_local Kokkos\n");
+  if (host_flag) {
+    return;
+  }
 
+  invoked_local = update->ntimestep;
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total 
+  // number of atoms.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  //printf(">>> total_range: %d\n", total_range);
+  chunksize = 32768;
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+
+  int vector_length_default = 1;
+  int team_size_default = 1;
+  if (!host_flag)
+    team_size_default = 32;//max_neighs;
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+    //Neigh
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagComputeGaussianGridLocalNeigh>(chunk_size,team_size,vector_length);
+      printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh> policy_neigh(chunk_size,team_size,vector_length);
+      printf(">>> Check 2\n");
+      Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+  } // end while
+
+  copymode = 0;
+
+  k_alocal.template modify<DeviceType>();
+  k_alocal.template sync<LMPHostType>();
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const
+{
+  const int ii = team.league_rank();
+  //printf("%d\n", ii);
+}
+
+/* ----------------------------------------------------------------------
+   check max team size
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<class TagStyle>
+void ComputeGaussianGridLocalKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
+  // Ask Kokkos for the largest team a parallel_for with this functor and tag accepts.
+  using policy_type = Kokkos::TeamPolicy<DeviceType,TagStyle>;
+  const int max_team = policy_type(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  // Shrink the requested team when team_size*vector_length exceeds that limit.
+  if (max_team < team_size*vector_length) team_size = max_team/vector_length;
 }
 
 namespace LAMMPS_NS {
diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
index 7698ce9567..474797584f 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
@@ -29,7 +29,7 @@ ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos<LMPHostT
 namespace LAMMPS_NS {
 
 // clang-format off
-//struct TagComputeGaussianGridLocal {};
+struct TagComputeGaussianGridLocalNeigh{};
 // clang-format on
 
 template <class DeviceType> class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal {
@@ -37,13 +37,25 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
 
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+#endif
+
   ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **);
   ~ComputeGaussianGridLocalKokkos() override;
+  void setup() override;
   void init() override;
   void compute_local() override;
 
-  //KOKKOS_INLINE_FUNCTION
-  //void operator()(TagComputeGaussianGridLocal const int &) const;
+  template<class TagStyle>
+  void check_team_size_for(int, int&, int);
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const;
 
  private:
   //double adof, mvv2e, mv2d, boltz;
@@ -52,6 +64,12 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
   Kokkos::View<int*, DeviceType> d_ninside;                // ninside for all atoms in list
   Kokkos::View<int*, DeviceType> d_map;                    // mapping from atom types to elements
 
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
   /*
   typename AT::t_x_array x;
   typename AT::t_v_array v;
@@ -67,6 +85,19 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
 
   //DAT::tdual_float_2d k_result;
   //typename AT::t_float_2d d_result;
+
+  int max_neighs, inum, chunk_size, chunk_offset;
+  int host_flag;
+  int total_range; // total number of loop iterations in grid
+  int xlen, ylen, zlen;
+  int chunksize; 
+  int ntotal; 
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+
+  DAT::tdual_float_2d k_alocal;
+  typename AT::t_float_2d d_alocal;
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index bd95c6a62c..81f3173a7d 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -135,7 +135,9 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
 {
+  //printf(">>> ComputeSNAGridKokkos destruct begin copymode %d\n", copymode);
   if (copymode) return;
+  //printf(">>> After copymode\n");
 
   memoryKK->destroy_kokkos(k_cutsq,cutsq);
   //memoryKK->destroy_kokkos(k_grid,grid);

From 9eb26e4cd0c995e4b159d5d765cc73af79b2e703 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Mon, 26 Jun 2023 16:43:28 -0600
Subject: [PATCH 26/51] Shallow copy Kokkos written array to returned array
 variable

---
 .../compute_gaussian_grid_local_kokkos.cpp    | 135 +++++++++++++++++-
 .../compute_gaussian_grid_local_kokkos.h      |  12 ++
 src/ML-SNAP/compute_gaussian_grid_local.cpp   |  11 +-
 src/ML-SNAP/compute_gaussian_grid_local.h     |   2 +-
 src/ML-SNAP/compute_grid_local.cpp            |  10 ++
 5 files changed, 163 insertions(+), 7 deletions(-)

diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
index e7da2a315a..5158cb5246 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -66,16 +66,25 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
   }
   //printf(">>> 1\n");
   // Set up element lists
-  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
   int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_sigmaelem,"ComputeSNAGridKokkos::sigmaelem",n+1);
+  MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1);
+  MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1);
   MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
   //printf(">>> 2\n");
   auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem);
+  auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem);
+  auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem);
   auto h_map = Kokkos::create_mirror_view(d_map);
   //printf(">>> 3\n");
   // start from index 1 because of how compute sna/grid is
   for (int i = 1; i <= atom->ntypes; i++) {
     h_radelem(i-1) = radelem[i];
+    h_sigmaelem(i-1) = sigmaelem[i];
+    h_prefacelem(i-1) = prefacelem[i];
+    h_argfacelem(i-1) = argfacelem[i];
   }
   //printf(">>> 4\n");
   // In pair snap some things like `map` get allocated regardless of chem flag.
@@ -87,6 +96,9 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
   */
   //printf(">>> 5\n");
   Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_sigmaelem,h_sigmaelem);
+  Kokkos::deep_copy(d_prefacelem, h_prefacelem);
+  Kokkos::deep_copy(d_argfacelem, h_argfacelem);
   Kokkos::deep_copy(d_map,h_map);
   //printf(">>> 6\n");
 
@@ -127,6 +139,8 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
   //gridlocal_allocated = 1;
   //array = gridall;
 
+  array_local = alocal;
+
   d_alocal = k_alocal.template view<DeviceType>();
   //d_grid = k_grid.template view<DeviceType>();
   //d_gridall = k_gridall.template view<DeviceType>();
@@ -188,6 +202,23 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
   if (!host_flag)
     team_size_default = 32;//max_neighs;
 
+  if (triclinic){
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];   
+    h5 = domain->h[5];   
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
   while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
 
     if (chunk_size > total_range - chunk_offset)
@@ -198,9 +229,9 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
       int vector_length = vector_length_default;
       int team_size = team_size_default;
       check_team_size_for<TagComputeGaussianGridLocalNeigh>(chunk_size,team_size,vector_length);
-      printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length);
+      //printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length);
       typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh> policy_neigh(chunk_size,team_size,vector_length);
-      printf(">>> Check 2\n");
+      //printf(">>> Check 2\n");
       Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this);
     }
 
@@ -213,6 +244,8 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
   k_alocal.template modify<DeviceType>();
   k_alocal.template sync<LMPHostType>();
 
+  printf(">>> k_alocal: %f\n", k_alocal.h_view(0,6));
+
 }
 
 /* ---------------------------------------------------------------------- */
@@ -223,6 +256,102 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianG
 {
   const int ii = team.league_rank();
   //printf("%d\n", ii);
+
+  if (ii >= chunk_size) return;
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // get a pointer to scratch memory
+  // It is meant to cache whether or not an atom is within the cutoff (not read or written in this kernel yet).
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+  //printf("%d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, so compute_local() caches domain->h[] and domain->boxlo[] in h0..h5 and lo0..lo2 for use here.
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // Zeroing out the components, which are filled as a sum.
+  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+    d_alocal(igrid, icol) = 0.0;
+  }
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+  
+  const int itype = 1;
+  int ielem = 0;
+  ielem = d_map[itype];
+  const double radi = d_radelem[ielem];
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+  
+
+  // Looping over ntotal for now.
+  
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    if (rsq < rnd_cutsq(jtype, jtype) ) {
+      //printf("%f %f\n", d_prefacelem(jtype-1), d_argfacelem(jtype-1));
+      int icol = size_local_cols_base + jtype - 1;
+      d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1));
+    }
+  }
+
+  //printf("%f\n", d_alocal(igrid, 6));
+  
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
index 474797584f..db3e87a7e9 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
@@ -61,6 +61,9 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
   //double adof, mvv2e, mv2d, boltz;
 
   Kokkos::View<double*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<double*, DeviceType> d_sigmaelem;
+  Kokkos::View<double*, DeviceType> d_prefacelem;
+  Kokkos::View<double*, DeviceType> d_argfacelem;
   Kokkos::View<int*, DeviceType> d_ninside;                // ninside for all atoms in list
   Kokkos::View<int*, DeviceType> d_map;                    // mapping from atom types to elements
 
@@ -98,6 +101,15 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
 
   DAT::tdual_float_2d k_alocal;
   typename AT::t_float_2d d_alocal;
+
+  // triclinic vars
+  /*
+  xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+  xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+  xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+  */
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp
index ec75563bcf..c660a16cee 100644
--- a/src/ML-SNAP/compute_gaussian_grid_local.cpp
+++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp
@@ -41,8 +41,8 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char *
   arg += nargbase;
   narg -= nargbase;
 
-  double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
 
   int ntypes = atom->ntypes;
   int nargmin = 4 + 2 * ntypes;
@@ -91,11 +91,14 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char *
 
 ComputeGaussianGridLocal::~ComputeGaussianGridLocal()
 {
+  //printf(">>> ComputeGaussianGridLocal begin destruct copymode %d\n", copymode);
+  if (copymode) return;
   memory->destroy(radelem);
   memory->destroy(sigmaelem);
   memory->destroy(prefacelem);
   memory->destroy(argfacelem);
   memory->destroy(cutsq);
+  //printf(">>> ComputeGaussianGridLocal end destruct\n");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -110,6 +113,8 @@ void ComputeGaussianGridLocal::init()
 
 void ComputeGaussianGridLocal::compute_local()
 {
+  printf(">>> compute_local CPU\n");
+  printf(">>> size_local_cols_base, size_local_cols: %d %d\n", size_local_cols_base, size_local_cols);
   invoked_local = update->ntimestep;
 
   // compute gaussian for each gridpoint
@@ -146,7 +151,7 @@ void ComputeGaussianGridLocal::compute_local()
           const double rsq = delx * delx + dely * dely + delz * delz;
           int jtype = type[j];
           if (rsq < cutsq[jtype][jtype]) {
-          int icol = size_local_cols_base + jtype - 1;
+            int icol = size_local_cols_base + jtype - 1;
             alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]);
           }
         }
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h b/src/ML-SNAP/compute_gaussian_grid_local.h
index cfab841a6e..72e7326b49 100644
--- a/src/ML-SNAP/compute_gaussian_grid_local.h
+++ b/src/ML-SNAP/compute_gaussian_grid_local.h
@@ -32,7 +32,7 @@ class ComputeGaussianGridLocal : public ComputeGridLocal {
   void compute_local() override;
   double memory_usage() override;
 
- private:
+ protected:
   int ncoeff;
   double **cutsq;
   double rcutfac;     // global cut-off scale
diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp
index 0f275a9aae..5dd8185ae7 100644
--- a/src/ML-SNAP/compute_grid_local.cpp
+++ b/src/ML-SNAP/compute_grid_local.cpp
@@ -61,13 +61,16 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeGridLocal::~ComputeGridLocal()
 {
+  printf(">>> ComputeGridLocal begin destruct\n");
   deallocate();
+  printf(">>> ComputeGridLocal end destruct\n");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeGridLocal::setup()
 {
+  printf(">>> ComputeGridLocal setup\n");
   deallocate();
   set_grid_global();
   set_grid_local();
@@ -106,6 +109,7 @@ void ComputeGridLocal::grid2lamda(int ix, int iy, int iz, double *x)
 
 void ComputeGridLocal::allocate()
 {
+  printf(">>> ComputeGridLocal::allocate %d %d\n", size_local_rows, size_local_cols);
   if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
     gridlocal_allocated = 1;
     memory->create(alocal, size_local_rows, size_local_cols, "compute/grid/local:alocal");
@@ -119,10 +123,14 @@ void ComputeGridLocal::allocate()
 
 void ComputeGridLocal::deallocate()
 {
+  //printf(">>> ComputeGridLocal::deallocate begin gridlocal_allocated %d copymode %d\n", gridlocal_allocated, copymode);
+  if (copymode) return;
+
   if (gridlocal_allocated) {
     gridlocal_allocated = 0;
     memory->destroy(alocal);
   }
+  //printf(">>> ComputeGridLocal:: deallocate end\n");
   array_local = nullptr;
 }
 
@@ -178,6 +186,8 @@ void ComputeGridLocal::set_grid_local()
   //   the 2 equality if tests ensure a consistent decision
   //   as to which proc owns it
 
+  //printf(">>> ComputeGridLocal set_grid_local\n");
+
   double xfraclo, xfrachi, yfraclo, yfrachi, zfraclo, zfrachi;
 
   if (comm->layout != Comm::LAYOUT_TILED) {

From c871fe8505067017ca20e3ce46f8ba01344f4802 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Mon, 26 Jun 2023 17:03:05 -0600
Subject: [PATCH 27/51] Fill grid geometry info

---
 src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
index 5158cb5246..11eda3a3e2 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -322,6 +322,14 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianG
     d_alocal(igrid, icol) = 0.0;
   }
 
+  // Fill grid info columns
+  d_alocal(igrid, 0) = ix;
+  d_alocal(igrid, 1) = iy;
+  d_alocal(igrid, 2) = iz;
+  d_alocal(igrid, 3) = xtmp;
+  d_alocal(igrid, 4) = ytmp;
+  d_alocal(igrid, 5) = ztmp;
+
   // currently, all grid points are type 1
   // not clear what a better choice would be
   

From 969cc5dc035d69c574aa860283ff6728c2553887 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Mon, 26 Jun 2023 17:41:13 -0600
Subject: [PATCH 28/51] Tweak TeamPolicy settings for speedup

---
 src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
index 11eda3a3e2..55e5b599e7 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -193,14 +193,14 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
   // `total_range` is the number of grid points which may be larger than chunk size.
   //printf(">>> total_range: %d\n", total_range);
-  chunksize = 32768;
+  chunksize = 10*32768; // 100*32768
   chunk_size = MIN(chunksize, total_range);
   chunk_offset = 0;
 
   int vector_length_default = 1;
   int team_size_default = 1;
   if (!host_flag)
-    team_size_default = 32;//max_neighs;
+    team_size_default = 1; // cost will increase with increasing team size //32;//max_neighs;
 
   if (triclinic){
     /*

From be5eb198c345ba35e9020a5d2fbd0cefbbe47f90 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Tue, 27 Jun 2023 11:50:00 -0600
Subject: [PATCH 29/51] Clean up debug prints

---
 src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp | 12 ++++++------
 src/ML-SNAP/compute_gaussian_grid_local.cpp       |  4 ++--
 src/ML-SNAP/compute_grid_local.cpp                |  8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
index 55e5b599e7..a52d747922 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -109,14 +109,14 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
 template<class DeviceType>
 ComputeGaussianGridLocalKokkos<DeviceType>::~ComputeGaussianGridLocalKokkos()
 {
-  printf(">>> ComputeGaussianGridLocalKokkos destruct begin, copymode %d\n", copymode);
+  //printf(">>> ComputeGaussianGridLocalKokkos destruct begin, copymode %d\n", copymode);
   if (copymode) return;
 
   memoryKK->destroy_kokkos(k_cutsq,cutsq);
   memoryKK->destroy_kokkos(k_alocal,alocal);
   //gridlocal_allocated = 0;
 
-  printf(">>> ComputeGaussianGridLocalKokkos end\n");
+  //printf(">>> ComputeGaussianGridLocalKokkos end\n");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -133,7 +133,7 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
   ComputeGridLocal::setup();
 
   // allocate arrays
-  printf(">>> rows cols kokkos init: %d %d\n", size_local_rows, size_local_cols);
+  //printf(">>> rows cols kokkos init: %d %d\n", size_local_rows, size_local_cols);
   memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
 
   //gridlocal_allocated = 1;
@@ -160,7 +160,7 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::init()
 template<class DeviceType>
 void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
 {
-  printf(">>> compute_local Kokkos begin\n");
+  //printf(">>> compute_local Kokkos begin\n");
 
   if (host_flag) {
     return;
@@ -193,7 +193,7 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
   // `total_range` is the number of grid points which may be larger than chunk size.
   //printf(">>> total_range: %d\n", total_range);
-  chunksize = 10*32768; // 100*32768
+  chunksize = 32768; // 100*32768
   chunk_size = MIN(chunksize, total_range);
   chunk_offset = 0;
 
@@ -244,7 +244,7 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
   k_alocal.template modify<DeviceType>();
   k_alocal.template sync<LMPHostType>();
 
-  printf(">>> k_alocal: %f\n", k_alocal.h_view(0,6));
+  //printf(">>> k_alocal: %f\n", k_alocal.h_view(0,6));
 
 }
 
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp
index c660a16cee..c18aa31d05 100644
--- a/src/ML-SNAP/compute_gaussian_grid_local.cpp
+++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp
@@ -113,8 +113,8 @@ void ComputeGaussianGridLocal::init()
 
 void ComputeGaussianGridLocal::compute_local()
 {
-  printf(">>> compute_local CPU\n");
-  printf(">>> size_local_cols_base, size_local_cols: %d %d\n", size_local_cols_base, size_local_cols);
+  //printf(">>> compute_local CPU\n");
+  //printf(">>> size_local_cols_base, size_local_cols: %d %d\n", size_local_cols_base, size_local_cols);
   invoked_local = update->ntimestep;
 
   // compute gaussian for each gridpoint
diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp
index 5dd8185ae7..6065f38171 100644
--- a/src/ML-SNAP/compute_grid_local.cpp
+++ b/src/ML-SNAP/compute_grid_local.cpp
@@ -61,16 +61,16 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeGridLocal::~ComputeGridLocal()
 {
-  printf(">>> ComputeGridLocal begin destruct\n");
+  //printf(">>> ComputeGridLocal begin destruct\n");
   deallocate();
-  printf(">>> ComputeGridLocal end destruct\n");
+  //printf(">>> ComputeGridLocal end destruct\n");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeGridLocal::setup()
 {
-  printf(">>> ComputeGridLocal setup\n");
+  //printf(">>> ComputeGridLocal setup\n");
   deallocate();
   set_grid_global();
   set_grid_local();
@@ -109,7 +109,7 @@ void ComputeGridLocal::grid2lamda(int ix, int iy, int iz, double *x)
 
 void ComputeGridLocal::allocate()
 {
-  printf(">>> ComputeGridLocal::allocate %d %d\n", size_local_rows, size_local_cols);
+  //printf(">>> ComputeGridLocal::allocate %d %d\n", size_local_rows, size_local_cols);
   if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
     gridlocal_allocated = 1;
     memory->create(alocal, size_local_rows, size_local_cols, "compute/grid/local:alocal");

From 3f9cc8f0fdd4bf272da65ee5cc1eb66df1bef6c9 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Fri, 7 Jul 2023 17:08:41 -0600
Subject: [PATCH 30/51] Initial sna/grid/local/kk implementation

---
 src/KOKKOS/compute_sna_grid_local_kokkos.cpp  |  81 ++
 src/KOKKOS/compute_sna_grid_local_kokkos.h    | 310 ++++++
 .../compute_sna_grid_local_kokkos_impl.h      | 924 ++++++++++++++++++
 src/ML-SNAP/compute_sna_grid_local.cpp        |   6 +-
 src/ML-SNAP/compute_sna_grid_local.h          |   6 +-
 5 files changed, 1324 insertions(+), 3 deletions(-)
 create mode 100644 src/KOKKOS/compute_sna_grid_local_kokkos.cpp
 create mode 100644 src/KOKKOS/compute_sna_grid_local_kokkos.h
 create mode 100644 src/KOKKOS/compute_sna_grid_local_kokkos_impl.h

diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
new file mode 100644
index 0000000000..087dbc5fd5
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
@@ -0,0 +1,81 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_local_kokkos.h"
+#include "compute_sna_grid_local_kokkos_impl.h"
+
+namespace LAMMPS_NS {
+
+template class ComputeSNAGridLocalKokkosDevice<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridLocalKokkosHost<LMPHostType>;
+#endif
+
+}
+
+
+
+
+// The following chunk will compile but we're gonna try a wrapper approach like pair snap.
+/*
+#include "compute_sna_grid_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "sna_kokkos.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+
+// ----------------------------------------------------------------------
+
+template<class DeviceType>
+ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeSNAGrid(lmp, narg, arg)
+{
+
+  printf("^^^ inside ComputeSNAGridKokkos constructor\n");
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+}
+
+// ----------------------------------------------------------------------
+
+template<class DeviceType>
+ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
+{
+  if (copymode) return;
+
+
+}
+
+namespace LAMMPS_NS {
+template class ComputeSNAGridKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridKokkos<LMPHostType>;
+#endif
+}
+*/
+
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h
new file mode 100644
index 0000000000..9fccb39aa2
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@@ -0,0 +1,310 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/local/kk,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
+ComputeStyle(sna/grid/local/kk/device,ComputeSNAGridLocalKokkosDevice<LMPDeviceType>);
+#ifdef LMP_KOKKOS_GPU
+ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosHost<LMPHostType>);
+#else
+ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice<LMPHostType>);
+#endif
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H
+
+#include "compute_sna_grid_local.h"
+#include "kokkos_type.h"
+//#include "pair_snap.h"
+//#include "kokkos_type.h"
+//#include "neigh_list_kokkos.h"
+#include "sna_kokkos.h"
+//#include "pair_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Routines for both the CPU and GPU backend
+//template<int NEIGHFLAG, int EVFLAG>
+//struct TagPairSNAPComputeForce{};
+
+
+// GPU backend only
+/*
+struct TagPairSNAPComputeNeigh{};
+struct TagPairSNAPComputeCayleyKlein{};
+struct TagPairSNAPPreUi{};
+struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
+struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
+struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+struct TagPairSNAPComputeZi{};
+struct TagPairSNAPBeta{};
+struct TagPairSNAPComputeBi{};
+struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
+struct TagPairSNAPComputeYi{};
+struct TagPairSNAPComputeYiWithZlist{};
+template<int dir>
+struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
+template<int dir>
+struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
+*/
+// Empty tag types: each one selects the matching operator() overload of ComputeSNAGridLocalKokkos in a kernel launch.
+struct TagCSNAGridLocalComputeNeigh{}; // find atoms within the cutoff of each grid point and store rij, wj, rcutij, ...
+struct TagCSNAGridLocalComputeCayleyKlein{};
+struct TagCSNAGridLocalPreUi{};
+struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence
+struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence
+struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+struct TagCSNAGridLocalComputeZi{};
+struct TagCSNAGridLocalComputeBi{};
+struct TagCSNAGridLocalTransformBi{}; // re-order blist from AoSoA to AoS
+struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array
+//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
+// Tags for the loop-style functors: one takes a flat index, the other takes (iz, iy, ix).
+struct TagComputeSNAGridLocalLoop{};
+struct TagComputeSNAGridLocal3D{};
+
+// CPU backend only (the pair snap tag list below is kept for reference and is commented out)
+/*
+struct TagPairSNAPComputeNeighCPU{};
+struct TagPairSNAPPreUiCPU{};
+struct TagPairSNAPComputeUiCPU{};
+struct TagPairSNAPTransformUiCPU{};
+struct TagPairSNAPComputeZiCPU{};
+struct TagPairSNAPBetaCPU{};
+struct TagPairSNAPComputeBiCPU{};
+struct TagPairSNAPZeroYiCPU{};
+struct TagPairSNAPComputeYiCPU{};
+struct TagPairSNAPComputeDuidrjCPU{};
+struct TagPairSNAPComputeDeidrjCPU{};
+*/
+struct TagComputeSNAGridLocalLoopCPU{};
+
+// Kokkos port of ComputeSNAGridLocal; its tagged operator() overloads are the kernel bodies launched from compute_local().
+template<class DeviceType, typename real_type_, int vector_length_>
+class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  static constexpr int vector_length = vector_length_;
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_transform_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_transform_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+#endif
+
+  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <class Device, int num_tiles, class TagComputeSNAP>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagComputeSNAP>;
+
+  // MDRangePolicy for the 3D grid loop (note: the TagComputeSNAP parameter is not forwarded to the policy):
+  template <class Device, class TagComputeSNAP>
+  using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
+
+  // Testing out team policies (same definition as SnapAoSoATeamPolicy below)
+  template <class Device, int num_teams,  class TagComputeSNAP>
+  using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <class Device, int num_teams, class TagComputeSNAP>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
+
+  ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridLocalKokkos() override;
+
+  void init() override;
+  void setup() override;
+  void compute_local() override;
+
+  // Utility functions for teams
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&);
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLocalLoop, const int& ) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLocalLoopCPU, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeNeigh>::member_type& team) const;
+
+  // PrintNeigh
+  //void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPrintNeigh>::member_type& team) const;
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi,const int iatom_mod, const int j, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocal2Fill,const int& ii) const;
+
+ protected:
+
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
+
+  int max_neighs, chunk_size, chunk_offset; // neighbor slots per grid point; grid points per chunk; grid offset of the current chunk
+  int host_flag; // nonzero when execution_space == Host
+  int ntotal; // nlocal + nghost atoms
+  int total_range; // total number of loop iterations in grid
+  int zlen; // nzhi-nzlo+1
+  int ylen; // nyhi-nylo+1
+  int xlen; // nxhi-nxlo+1
+
+  double cutsq_tmp; // temporary cutsq until we get a view
+
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // element weights
+  //Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+  Kokkos::View<real_type*, DeviceType> d_test;              // test view (debug scaffolding, filled in the constructor)
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+  DAT::tdual_float_2d k_grid;
+  DAT::tdual_float_2d k_gridall;
+  typename AT::t_float_2d d_grid;
+  typename AT::t_float_2d d_gridall;
+
+  DAT::tdual_float_4d k_gridlocal;
+  typename AT::t_float_4d d_gridlocal;
+
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh and ComputeUi
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  class DomainKokkos *domainKK;
+
+  // triclinic vars: copies of the domain values used by this conversion inside the kernels
+  /*
+  xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+  xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+  xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+  */
+  double h0, h1, h2, h3, h4, h5; // domain->h[0..5], set in compute_local() when triclinic
+  double lo0, lo1, lo2; // domain->boxlo[0..2], set in compute_local() when triclinic
+
+};
+
+// These wrapper classes exist to keep the compute style factory happy, i.e. to avoid
+// extending the compute style factory to support Compute classes with an arbitrary number
+// of extra template parameters.
+
+template <class DeviceType>
+class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+  // Pins real_type to SNAP_KOKKOS_REAL and vector_length to SNAP_KOKKOS_DEVICE_VECLEN.
+ private:
+  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+ public:
+
+  ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **);
+
+  void init() override;
+  void compute_local() override;
+  // setup() is not overridden here, so the Base implementation is used.
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+template <class DeviceType>
+class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+  // Only declared in GPU builds; pins vector_length to SNAP_KOKKOS_HOST_VECLEN instead of the device value.
+ private:
+  using Base = ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+ public:
+
+  ComputeSNAGridLocalKokkosHost(class LAMMPS *, int, char **);
+
+  void init() override;
+  void compute_local() override;
+
+};
+#endif
+
+}
+
+#endif
+#endif
\ No newline at end of file
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
new file mode 100644
index 0000000000..67ea878143
--- /dev/null
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -0,0 +1,924 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
+                         Evan Weinberg (NVIDIA)
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_local_kokkos.h"
+#include "pair_snap_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+//#include "sna_kokkos.h"
+#include "domain.h"
+#include "domain_kokkos.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+namespace LAMMPS_NS {
+
+// Constructor: sets Kokkos bookkeeping, the cutsq DualView, the per-element device views, and the SNAKokkos helper.
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocal(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridLocalKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
+
+  cutsq_tmp = cutsq[1][1];
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
+    }
+  }
+  k_cutsq.template modify<LMPHostType>(); // mark the host copy dirty once; compute_local() syncs it to the device
+
+  // Set up element lists (device views with nelements entries each)
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridLocalKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridLocalKokkos:wjelem",nelements);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridLocalKokkos:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridLocalKokkos:dinnerelem",nelements);
+  // test view (debug scaffolding)
+  MemKK::realloc_kokkos(d_test, "ComputeSNAGridLocalKokkos::test", nelements);
+
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridLocalKokkos::map",n+1);
+
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // test view (debug scaffolding)
+  auto h_test = Kokkos::create_mirror_view(d_test);
+  h_test(0) = 2.0;
+
+  // Host arrays are 1-based (per type), device views are 0-based. NOTE(review): confirm nelements >= atom->ntypes, else i-1 overruns the views.
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_wjelem(i-1) = wjelem[i];
+    if (switchinnerflag){
+      h_sinnerelem(i-1) = sinnerelem[i]; // 0-based like radelem/wjelem: the kernels read d_sinnerelem[ielem] with ielem starting at 0
+      h_dinnerelem(i-1) = dinnerelem[i];
+    }
+  }
+
+  // In pair snap some things like `map` get allocated regardless of chem flag.
+  if (chemflag){
+    for (int i = 1; i <= atom->ntypes; i++) {
+      h_map(i) = map[i];
+    }
+  }
+
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag){
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  if (chemflag){
+    Kokkos::deep_copy(d_map,h_map);
+  }
+  Kokkos::deep_copy(d_test,h_test);
+
+  // Build the SNAKokkos helper from the parsed compute settings; rij storage starts empty and is grown in compute_local().
+
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
+    rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
+
+}
+
+// Destructor: releases the cutsq DualView unless this object is flagged as a copy.
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridLocalKokkos()
+{
+  // compute_local() sets copymode before launching kernels with *this; while the flag is set, cleanup is skipped.
+  if (copymode) return;
+  // Only k_cutsq/cutsq is released; the grid DualViews below are currently not destroyed here.
+
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  //memoryKK->destroy_kokkos(k_grid,grid);
+  //memoryKK->destroy_kokkos(k_gridall, gridall);
+  //memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
+}
+
+// Init: forwards to ComputeSNAGridLocal::init(), except on the host execution space where it does nothing.
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::init()
+{
+  if (host_flag) {
+    return;
+  }
+  ComputeSNAGridLocal::init();
+
+}
+
+// Setup: marks gridlocal as not allocated and refreshes the device views of the grid DualViews.
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::setup()
+{
+
+  // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
+  // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
+
+  //ComputeGrid::set_grid_global();
+  //ComputeGrid::set_grid_local();
+  //ComputeSNAGridLocal::setup();
+  
+  // allocate arrays
+  //memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
+
+  // do not use or allocate gridlocal for now
+
+  gridlocal_allocated = 0;
+  //array = gridall;
+
+  d_gridlocal = k_gridlocal.template view<DeviceType>();
+  //d_grid = k_grid.template view<DeviceType>();
+  d_gridall = k_gridall.template view<DeviceType>();
+}
+
+// Compute: runs the SNA kernel sequence over the local grid points, one chunk of at most `chunksize` points at a time.
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_local()
+{
+  if (host_flag) {
+    return;
+  }
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // NOTE(review): fixed neighbor capacity; ComputeNeigh does not appear to clamp its neighbor count to this value - confirm.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
+  // number of atoms.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  //printf(">>> total_range: %d\n", total_range);
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+  //snaKK.grow_rij(chunk_size, ntotal);
+  snaKK.grow_rij(chunk_size, max_neighs);
+
+  //chunk_size = total_range;
+
+  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+  if (triclinic){
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];
+    h5 = domain->h[5];
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+    //printf(">>> chunk_offset: %d\n", chunk_offset);
+
+    //ComputeNeigh (NOTE(review): scratch is sized with max_neighs, but the kernel requests team_size*ntotal ints - confirm)
+    {
+      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridLocalComputeNeigh>
+        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeCayleyKlein
+    {
+      // tile_size_compute_ck is defined in `compute_sna_grid_local_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridLocalComputeCayleyKlein>
+        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+    }
+
+    //PreUi
+    {
+      // tile_size_pre_ui is defined in `compute_sna_grid_local_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridLocalPreUi>
+        policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
+      Kokkos::parallel_for("PreUi",policy_preui,*this);
+    }
+
+    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+    {
+      // team_size_compute_ui is defined in `compute_sna_grid_local_kokkos.h`
+      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+      const int tile_size = vector_length * (twojmax + 1);
+      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+      if (chunk_size < parallel_thresh)
+      {
+        // Version with parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
+        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+      } else {
+        // Version w/out parallelism  over j_bend
+
+        // total number of teams needed: (natoms / 32) * (max_neighs)
+        const int n_teams = chunk_size_div * max_neighs;
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridLocalComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      }
+    }
+
+    //TransformUi: un-"fold" ulisttot, zero ylist
+    {
+      // tile_size_transform_ui is defined in `compute_sna_grid_local_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridLocalTransformUi>
+          policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
+      Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
+    }
+
+    //Compute bispectrum in AoSoA data layout, transform Bi
+
+    //ComputeZi
+    const int idxz_max = snaKK.idxz_max;
+    Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi>
+        policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
+    Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
+
+    //ComputeBi
+    const int idxb_max = snaKK.idxb_max;
+    Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi>
+        policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
+    Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
+
+    //Transform data layout of blist out of AoSoA
+    //We need this because `blist` gets used in ComputeForce which doesn't
+    //take advantage of AoSoA, which at best would only be beneficial on the margins
+    //NOTE: Do we need this in compute sna/grid/kk?
+    Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridLocalTransformBi>
+        policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
+    Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+
+    // Fill the grid array with bispectrum values
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocal2Fill> policy_fill(0,chunk_size);
+      Kokkos::parallel_for(policy_fill, *this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+
+  } // end while
+  copymode = 0; // all launches issued: clear the flag so the destructor's `if (copymode) return;` no longer skips cleanup
+  k_gridlocal.template modify<DeviceType>();
+  k_gridlocal.template sync<LMPHostType>();
+
+  //k_grid.template modify<DeviceType>();
+  //k_grid.template sync<LMPHostType>();
+
+  k_gridall.template modify<DeviceType>();
+  k_gridall.template sync<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+/*
+ Simple team policy functor seeing how many layers deep we can go with the parallelism.
+ */
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeNeigh>::member_type& team) const {
+
+  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
+  // Main difference is that we don't use the neighbor class or neighbor variables here.
+  // This is because the grid points are not atoms and therefore do not get assigned
+  // neighbors in LAMMPS. 
+  // TODO: If we did make a neighborlist for each grid point, we could use current 
+  //       routines and avoid having to loop over all atoms (which limits us to 
+  //       natoms = max team size).
+
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // basic quantities associated with this team:
+  // team_rank : rank of thread in this team
+  // league_rank : rank of team in this league
+  // team_size : number of threads in this team
+
+  // extract loop index
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+
+  if (ii >= chunk_size) return;
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+  //printf("%d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+  const double radi = d_radelem[ielem];
+
+  // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
+  // The purpose here is to transform for triclinic boxes.
+  /*
+  if (triclinic){
+    printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
+  }
+  */
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+  
+  // Looping over ntotal for now.
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // don't include atoms that share location with grid point
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    } 
+
+    if (jtype >= 0)
+      ninside++;
+
+  } 
+
+  /*
+  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
+    [&] (const int j, int& count) {
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // don't include atoms that share location with grid point
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    } 
+
+    type_cache[j] = jtype;
+
+    if (jtype >= 0)
+     count++;
+
+  }, ninside);
+  */
+
+  d_ninside(ii) = ninside; 
+
+  // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    //const int jtype = type_cache[j];
+    //if (jtype >= 0) {
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    int jtype = type(j);
+    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      my_sna.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        my_sna.element(ii,offset) = jelem;
+      else
+        my_sna.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+
+  /*
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    const int jtype = type_cache[j];
+    if (jtype >= 0) {
+      printf(">>> offset: %d\n", offset);
+      const F_FLOAT dx = x(j,0) - xtmp;
+      const F_FLOAT dy = x(j,1) - ytmp;
+      const F_FLOAT dz = x(j,2) - ztmp;
+      int jtype = type(j);
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      my_sna.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        my_sna.element(ii,offset) = jelem;
+      else
+        my_sna.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+  */
+
+  /*
+  Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
+    [&] (const int j, int& offset, bool final) {
+
+    const int jtype = type_cache[j];
+
+    if (jtype >= 0) {
+      if (final) {
+        const F_FLOAT dx = x(j,0) - xtmp;
+        const F_FLOAT dy = x(j,1) - ytmp;
+        const F_FLOAT dz = x(j,2) - ztmp;
+        int jtype = type(j);
+        int jelem = 0;
+        if (chemflag) jelem = d_map[jtype];
+        my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+        my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+        my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+        // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+        // actually since the views here have values starting at 0, let's use jelem
+        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+        my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+        my_sna.inside(ii,offset) = j;
+        if (switchinnerflag) {
+          my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+          my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+        }
+        if (chemflag)
+          my_sna.element(ii,offset) = jelem;
+        else
+          my_sna.element(ii,offset) = 0;
+      }
+      offset++;
+    }
+  });
+  */
+}
+
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+  // Kernel for one (grid point, neighbor slot) pair: filters unused work items, then calls compute_cayley_klein.
+  SNAKokkos<DeviceType, real_type, vector_length> sna = snaKK;
+
+  // chunk-local grid point index rebuilt from its (iatom_mod, iatom_div) pair
+  const int igridpt = iatom_div * vector_length + iatom_mod;
+
+  // skip indices past the end of the chunk and neighbor slots beyond the stored neighbor count
+  if (igridpt >= chunk_size || jnbor >= d_ninside(igridpt)) return;
+  sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int iatom_mod, const int j, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+  // Kernel for one (grid point, j) work item: forwards to SNAKokkos::pre_ui with the central element fixed to 0.
+  const int ii = iatom_mod + iatom_div * vector_length;
+  if (ii >= chunk_size) return;
+
+  // Do not look up type(ii) here: `type` is indexed by atom (type(j), j < ntotal) in this file, while ii is a grid-point index.
+  // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp`
+  const int ielem = 0;
+
+  my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiSmall>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+  // Team kernel: each team thread takes one (iatom_div, jbend, jj) triple and loops over iatom_mod via ThreadVectorRange.
+  // one flattened work index per team thread; it encodes iatom_div, jbend and jj
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // decode: iatom_div is the slowest index, then jbend (0..twojmax), then jj (0..max_neighs-1)
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div; // chunk-local grid point index
+    if (ii >= chunk_size) return; // index lies past the end of the chunk
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return; // neighbor slot jj is beyond the count stored for this grid point
+
+    my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiLarge>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+  // Team kernel: each team thread takes one (iatom_div, jj) pair and loops over iatom_mod via ThreadVectorRange.
+  // one flattened work index per team thread; it encodes iatom_div and jj (no jbend component is decoded in this variant)
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // decode: iatom_div is the slow index, jj (0..max_neighs-1) the fast one
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div; // chunk-local grid point index
+    if (ii >= chunk_size) return; // index lies past the end of the chunk
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return; // neighbor slot jj is beyond the count stored for this grid point
+
+    my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
+  // Kernel for one (grid point, idxu) pair: per element, builds ulisttot_pack(.., idxu, ..) from the ulisttot_re_pack/ulisttot_im_pack entry at idxu_half.
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  // idxu indexes idxu_full_half below; reject idxu == idxu_max too, consistent with the `>=` bounds checks of the Zi/Bi kernels in this file.
+  if (idxu >= my_sna.idxu_max) return;
+
+  const int elem_count = chemflag ? nelements : 1;
+
+  for (int ielem = 0; ielem < elem_count; ielem++){
+    const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
+
+    auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+
+    // flip_sign == 1 negates the imaginary part, -1 negates the real part, any other value leaves both as read
+    if (mapper.flip_sign == 1){
+      utot_im = -utot_im;
+    } else if (mapper.flip_sign == -1){
+      utot_re = -utot_re;
+    }
+    my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
+    if (mapper.flip_sign == 0) { // additionally clear the ylist entries at the half index
+      my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+      my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+    }
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
+  // Kernel for one (grid point, jjz) pair: bounds-checks both indices, then calls SNAKokkos::compute_zi.
+  SNAKokkos<DeviceType, real_type, vector_length> sna = snaKK;
+
+  // chunk-local grid point index; work items at or past chunk_size / idxz_max are skipped
+  const int igridpt = iatom_div * vector_length + iatom_mod;
+  const bool in_range = (igridpt < chunk_size) && (jjz < sna.idxz_max);
+
+  if (in_range) sna.compute_zi(iatom_mod,jjz,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
+  // Kernel for one (grid point, jjb) pair: bounds-checks both indices, then calls SNAKokkos::compute_bi.
+  SNAKokkos<DeviceType, real_type, vector_length> sna = snaKK;
+
+  // chunk-local grid point index; work items at or past chunk_size / idxb_max are skipped
+  const int igridpt = iatom_div * vector_length + iatom_mod;
+  const bool in_range = (igridpt < chunk_size) && (jjb < sna.idxb_max);
+
+  if (in_range) sna.compute_bi(iatom_mod,jjb,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
+  // Kernel for one (grid point, idxb) pair: copies every triple's value from blist_pack
+  // (indexed by iatom_mod, idxb, itriple, iatom_div) into blist (indexed by grid point, itriple, idxb).
+  SNAKokkos<DeviceType, real_type, vector_length> sna = snaKK;
+
+  const int igridpt = iatom_div * vector_length + iatom_mod;
+  if (igridpt >= chunk_size || idxb >= sna.idxb_max) return;
+
+  int itriple = 0;
+  const int ntriples = sna.ntriples;
+
+  while (itriple < ntriples) {
+    // the value passes through a real_type temporary before being stored
+    const real_type value = sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
+    sna.blist(igridpt, itriple, idxb) = value;
+    ++itriple;
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocal2Fill, const int& ii) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // Fill kernel for one grid point: writes its coordinates (columns 0-2) and its blist values (columns 3..ncoeff+2) into row igrid of d_gridall.
+  // grid index = chunk-local index ii shifted by chunk_offset
+  int igrid = ii + chunk_offset;
+
+  // split igrid into (ix,iy,iz) with ix varying fastest, then shift each by its lower bound (nxlo/nylo/nzlo)
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  // int igrid = iz * (nx * ny) + iy * nx + ix;
+  // printf("ii igrid: %d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // h0..h5 and lo0..lo2 are used below in place of domain->h[0..5] and domain->boxlo[0..2] (see the commented-out block); using domainKK-> here gives a segfault.
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+  d_gridall(igrid,0) = xtmp;
+  d_gridall(igrid,1) = ytmp;
+  d_gridall(igrid,2) = ztmp;
+
+  const auto idxb_max = snaKK.idxb_max;
+
+  // linear contributions
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto idxb = icoeff % idxb_max;
+    const auto idx_chem = icoeff / idxb_max;
+    d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   utility functions
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
+  // Shrinks team_size (in place) so that team_size*vector_length does not exceed the
+  // team-size limit Kokkos reports for launching this functor with TagStyle via parallel_for.
+  using policy_type = Kokkos::TeamPolicy<DeviceType,TagStyle>;
+  const int limit = policy_type(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > limit) team_size = limit/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
+  // Shrinks team_size (in place) so that team_size*vector_length does not exceed the
+  // team-size limit Kokkos reports for launching this functor with TagStyle via parallel_reduce.
+  using policy_type = Kokkos::TeamPolicy<DeviceType,TagStyle>;
+  const int limit = policy_type(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  if (team_size*vector_length > limit) team_size = limit/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<typename scratch_type>
+int ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
+  // Returns shmem_size(values_per_team) of an unmanaged 1-D scratch-space view whose elements are scratch_type.
+  using scratch_view = Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+  return scratch_view::shmem_size(values_per_team);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+ComputeSNAGridLocalKokkosDevice<DeviceType>::ComputeSNAGridLocalKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; } // forwards (lmp, narg, arg) unchanged to the base instantiated with SNAP_KOKKOS_REAL and SNAP_KOKKOS_DEVICE_VECLEN; no work of its own
+
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosDevice<DeviceType>::init()
+{
+  Base::init(); // pure forwarder; Base is declared outside this section (presumably the ComputeSNAGridLocalKokkos base named in the constructor) - TODO confirm
+}
+
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosDevice<DeviceType>::compute_local()
+{
+  Base::compute_local(); // pure forwarder; Base is declared outside this section (presumably the ComputeSNAGridLocalKokkos base named in the constructor) - TODO confirm
+}
+
+#ifdef LMP_KOKKOS_GPU
+template<class DeviceType>
+ComputeSNAGridLocalKokkosHost<DeviceType>::ComputeSNAGridLocalKokkosHost(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; } // forwards (lmp, narg, arg) unchanged to the base instantiated with SNAP_KOKKOS_REAL and SNAP_KOKKOS_HOST_VECLEN; compiled only when LMP_KOKKOS_GPU is defined
+
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosHost<DeviceType>::init()
+{
+  Base::init(); // pure forwarder; Base is declared outside this section (presumably the ComputeSNAGridLocalKokkos base named in the constructor) - TODO confirm
+}
+
+template<class DeviceType>
+void ComputeSNAGridLocalKokkosHost<DeviceType>::compute_local()
+{
+  Base::compute_local(); // pure forwarder; Base is declared outside this section (presumably the ComputeSNAGridLocalKokkos base named in the constructor) - TODO confirm
+}
+#endif
+
+}
diff --git a/src/ML-SNAP/compute_sna_grid_local.cpp b/src/ML-SNAP/compute_sna_grid_local.cpp
index 1d42a42c05..3981970506 100644
--- a/src/ML-SNAP/compute_sna_grid_local.cpp
+++ b/src/ML-SNAP/compute_sna_grid_local.cpp
@@ -37,8 +37,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
 
   // begin code common to all SNAP computes
 
-  double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
 
   int ntypes = atom->ntypes;
   int nargmin = 6 + 2 * ntypes;
@@ -56,6 +56,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
   wselfallflag = 0;
   switchinnerflag = 0;
   nelements = 1;
+  chunksize = 32768;
+  parallel_thresh = 8192;
 
   // process required arguments
 
diff --git a/src/ML-SNAP/compute_sna_grid_local.h b/src/ML-SNAP/compute_sna_grid_local.h
index 0475212e13..85662ad509 100644
--- a/src/ML-SNAP/compute_sna_grid_local.h
+++ b/src/ML-SNAP/compute_sna_grid_local.h
@@ -32,7 +32,7 @@ class ComputeSNAGridLocal : public ComputeGridLocal {
   void compute_local() override;
   double memory_usage() override;
 
- private:
+ protected:
   int ncoeff;
   double **cutsq;
   double rcutfac;
@@ -46,6 +46,10 @@ class ComputeSNAGridLocal : public ComputeGridLocal {
   class SNA *snaptr;
   double cutmax;
   int quadraticflag;
+  double rfac0, rmin0;
+  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  int chunksize;
+  int parallel_thresh;
 };
 
 }    // namespace LAMMPS_NS

From b5dc7d58a8645ad19e5152dc050505a796118efc Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Sun, 9 Jul 2023 10:20:45 -0600
Subject: [PATCH 31/51] Destruct sna/grid/local/kk properly and use local
 arrays

---
 src/KOKKOS/compute_sna_grid_local_kokkos.h    |  6 ++++
 .../compute_sna_grid_local_kokkos_impl.h      | 35 +++++++++++--------
 src/ML-SNAP/compute_grid_local.cpp            |  6 ++--
 src/ML-SNAP/compute_sna_grid_local.cpp        |  1 +
 4 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h
index 9fccb39aa2..d11d2e1623 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@@ -240,6 +240,11 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
 
   typename AT::t_x_array_randomread x;
   typename AT::t_int_1d_randomread type;
+  
+  DAT::tdual_float_2d k_alocal;
+  typename AT::t_float_2d d_alocal;
+
+  /*
   DAT::tdual_float_2d k_grid;
   DAT::tdual_float_2d k_gridall;
   typename AT::t_float_2d d_grid;
@@ -247,6 +252,7 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
 
   DAT::tdual_float_4d k_gridlocal;
   typename AT::t_float_4d d_gridlocal;
+  */
 
 
   // Utility routine which wraps computing per-team scratch size requirements for
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
index 67ea878143..e8555a2101 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -135,11 +135,12 @@ ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridL
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridLocalKokkos()
 {
-  //printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode);
+  printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode);
   if (copymode) return;
   //printf(">>> After copymode\n");
 
   memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  memoryKK->destroy_kokkos(k_alocal,alocal);
   //memoryKK->destroy_kokkos(k_grid,grid);
   //memoryKK->destroy_kokkos(k_gridall, gridall);
   //memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
@@ -169,18 +170,23 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::setup()
   //ComputeGrid::set_grid_global();
   //ComputeGrid::set_grid_local();
   //ComputeSNAGridLocal::setup();
+  ComputeGridLocal::setup();
   
   // allocate arrays
   //memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
+  memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
 
   // do not use or allocate gridlocal for now
 
-  gridlocal_allocated = 0;
+  //gridlocal_allocated = 0;
   //array = gridall;
 
-  d_gridlocal = k_gridlocal.template view<DeviceType>();
+  array_local = alocal;
+
+  //d_gridlocal = k_gridlocal.template view<DeviceType>();
   //d_grid = k_grid.template view<DeviceType>();
-  d_gridall = k_gridall.template view<DeviceType>();
+  //d_gridall = k_gridall.template view<DeviceType>();
+  d_alocal = k_alocal.template view<DeviceType>();
 }
 
 // Compute
@@ -192,6 +198,8 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
     return;
   }
 
+  printf(">>> ComputeSNAGridLocalKokkos::compute_local begin\n");
+
   copymode = 1;
 
   zlen = nzhi-nzlo+1;
@@ -212,6 +220,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
 
   ntotal = atomKK->nlocal + atomKK->nghost;
   // Allocate view for number of neighbors per grid point
+  printf(">>> total_range: %d\n", total_range);
   MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range);
 
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
@@ -351,14 +360,10 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
 
   } // end while
 
-  k_gridlocal.template modify<DeviceType>();
-  k_gridlocal.template sync<LMPHostType>();
+  copymode = 0;
 
-  //k_grid.template modify<DeviceType>();
-  //k_grid.template sync<LMPHostType>();
-
-  k_gridall.template modify<DeviceType>();
-  k_gridall.template sync<LMPHostType>();
+  k_alocal.template modify<DeviceType>();
+  k_alocal.template sync<LMPHostType>();
 }
 
 /* ----------------------------------------------------------------------
@@ -830,9 +835,9 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   const F_FLOAT xtmp = xgrid[0];
   const F_FLOAT ytmp = xgrid[1];
   const F_FLOAT ztmp = xgrid[2];
-  d_gridall(igrid,0) = xtmp;
-  d_gridall(igrid,1) = ytmp;
-  d_gridall(igrid,2) = ztmp;
+  //d_gridall(igrid,0) = xtmp;
+  //d_gridall(igrid,1) = ytmp;
+  //d_gridall(igrid,2) = ztmp;
 
   const auto idxb_max = snaKK.idxb_max;
 
@@ -841,7 +846,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
     const auto idxb = icoeff % idxb_max;
     const auto idx_chem = icoeff / idxb_max;
-    d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
+    //d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
   }
 
 }
diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp
index 6065f38171..48a0e777e0 100644
--- a/src/ML-SNAP/compute_grid_local.cpp
+++ b/src/ML-SNAP/compute_grid_local.cpp
@@ -61,21 +61,21 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeGridLocal::~ComputeGridLocal()
 {
-  //printf(">>> ComputeGridLocal begin destruct\n");
+  printf(">>> ComputeGridLocal begin destruct\n");
   deallocate();
-  //printf(">>> ComputeGridLocal end destruct\n");
+  printf(">>> ComputeGridLocal end destruct\n");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeGridLocal::setup()
 {
-  //printf(">>> ComputeGridLocal setup\n");
   deallocate();
   set_grid_global();
   set_grid_local();
   allocate();
   assign_coords();
+  printf(">>> ComputeGridLocal setup nx ny nz %d %d %d %d %d %d\n", nxlo, nxhi, nylo, nyhi, nzlo, nzhi);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/ML-SNAP/compute_sna_grid_local.cpp b/src/ML-SNAP/compute_sna_grid_local.cpp
index 3981970506..db49063920 100644
--- a/src/ML-SNAP/compute_sna_grid_local.cpp
+++ b/src/ML-SNAP/compute_sna_grid_local.cpp
@@ -182,6 +182,7 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeSNAGridLocal::~ComputeSNAGridLocal()
 {
+  if (copymode) return;
   memory->destroy(radelem);
   memory->destroy(wjelem);
   memory->destroy(cutsq);

From cb915cdce7a2f7e1776310493389137fb10c2027 Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Mon, 10 Jul 2023 10:22:52 -0600
Subject: [PATCH 32/51] Fill local sna/grid array

---
 .../compute_sna_grid_local_kokkos_impl.h      | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
index e8555a2101..ee7cd464cd 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -456,6 +456,19 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   const F_FLOAT ytmp = xgrid[1];
   const F_FLOAT ztmp = xgrid[2];
 
+  // Zeroing out the components, which are filled as a sum.
+  for (int icol = size_local_cols_base; icol < size_local_cols; icol++){
+    d_alocal(igrid, icol) = 0.0;
+  }
+
+  // Fill grid info columns
+  d_alocal(igrid, 0) = ix;
+  d_alocal(igrid, 1) = iy;
+  d_alocal(igrid, 2) = iz;
+  d_alocal(igrid, 3) = xtmp;
+  d_alocal(igrid, 4) = ytmp;
+  d_alocal(igrid, 5) = ztmp;
+
   // currently, all grid points are type 1
   // not clear what a better choice would be
 
@@ -832,9 +845,9 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
     xgrid[2] = h2*xgrid[2] + lo2;
   }
 
-  const F_FLOAT xtmp = xgrid[0];
-  const F_FLOAT ytmp = xgrid[1];
-  const F_FLOAT ztmp = xgrid[2];
+  //const F_FLOAT xtmp = xgrid[0];
+  //const F_FLOAT ytmp = xgrid[1];
+  //const F_FLOAT ztmp = xgrid[2];
   //d_gridall(igrid,0) = xtmp;
   //d_gridall(igrid,1) = ytmp;
   //d_gridall(igrid,2) = ztmp;
@@ -846,7 +859,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
     const auto idxb = icoeff % idxb_max;
     const auto idx_chem = icoeff / idxb_max;
-    //d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
+    d_alocal(igrid,icoeff+6) = my_sna.blist(ii,idx_chem,idxb);
   }
 
 }

From 79e05595dbe6d6d25e7b4cee8c5adee4722561cf Mon Sep 17 00:00:00 2001
From: rohskopf <drew.rohskopf@gmail.com>
Date: Tue, 11 Jul 2023 13:11:50 -0600
Subject: [PATCH 33/51] Remove destructor prints

---
 src/KOKKOS/compute_sna_grid_local_kokkos_impl.h | 6 +++---
 src/ML-SNAP/compute_grid_local.cpp              | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
index ee7cd464cd..73a9df39ac 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -135,7 +135,7 @@ ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridL
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridLocalKokkos()
 {
-  printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode);
+  //printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode);
   if (copymode) return;
   //printf(">>> After copymode\n");
 
@@ -198,7 +198,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
     return;
   }
 
-  printf(">>> ComputeSNAGridLocalKokkos::compute_local begin\n");
+  //printf(">>> ComputeSNAGridLocalKokkos::compute_local begin\n");
 
   copymode = 1;
 
@@ -220,7 +220,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
 
   ntotal = atomKK->nlocal + atomKK->nghost;
   // Allocate view for number of neighbors per grid point
-  printf(">>> total_range: %d\n", total_range);
+  //printf(">>> total_range: %d\n", total_range);
   MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range);
 
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp
index 48a0e777e0..92bb556c50 100644
--- a/src/ML-SNAP/compute_grid_local.cpp
+++ b/src/ML-SNAP/compute_grid_local.cpp
@@ -61,9 +61,9 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeGridLocal::~ComputeGridLocal()
 {
-  printf(">>> ComputeGridLocal begin destruct\n");
+  //printf(">>> ComputeGridLocal begin destruct\n");
   deallocate();
-  printf(">>> ComputeGridLocal end destruct\n");
+  //printf(">>> ComputeGridLocal end destruct\n");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -75,7 +75,7 @@ void ComputeGridLocal::setup()
   set_grid_local();
   allocate();
   assign_coords();
-  printf(">>> ComputeGridLocal setup nx ny nz %d %d %d %d %d %d\n", nxlo, nxhi, nylo, nyhi, nzlo, nzhi);
+  //printf(">>> ComputeGridLocal setup nx ny nz %d %d %d %d %d %d\n", nxlo, nxhi, nylo, nyhi, nzlo, nzhi);
 }
 
 /* ----------------------------------------------------------------------

From a5b262aefad8680886a55667b95c23440a71bfc6 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Fri, 8 Sep 2023 00:20:45 +0200
Subject: [PATCH 34/51] Hotfixing a small bug in the Kokkos Gaussian Compute

Co-authored-by: Drew Rohskopf <drew.rohskopf@gmail.com>
---
 src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
index a52d747922..6913fd284b 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -67,7 +67,7 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
   //printf(">>> 1\n");
   // Set up element lists
   int n = atom->ntypes;
-  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",n);
   MemKK::realloc_kokkos(d_sigmaelem,"ComputeSNAGridKokkos::sigmaelem",n+1);
   MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1);
   MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1);

From 2185ffa4280072a3325b9795c92fe89632501f38 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Thu, 5 Dec 2024 16:43:18 +0100
Subject: [PATCH 35/51] Renamed files to be more consistent with other examples

---
 examples/snap/{in.grid.gaussian => in.gaussian.grid}  | 0
 examples/snap/{in.grid.snap => in.snap.grid}          | 0
 examples/snap/{in.grid.tri => in.snap.grid.triclinic} | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename examples/snap/{in.grid.gaussian => in.gaussian.grid} (100%)
 rename examples/snap/{in.grid.snap => in.snap.grid} (100%)
 rename examples/snap/{in.grid.tri => in.snap.grid.triclinic} (100%)

diff --git a/examples/snap/in.grid.gaussian b/examples/snap/in.gaussian.grid
similarity index 100%
rename from examples/snap/in.grid.gaussian
rename to examples/snap/in.gaussian.grid
diff --git a/examples/snap/in.grid.snap b/examples/snap/in.snap.grid
similarity index 100%
rename from examples/snap/in.grid.snap
rename to examples/snap/in.snap.grid
diff --git a/examples/snap/in.grid.tri b/examples/snap/in.snap.grid.triclinic
similarity index 100%
rename from examples/snap/in.grid.tri
rename to examples/snap/in.snap.grid.triclinic

From 30d39c8fb311565613c976583449d12674b19b11 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Fri, 6 Dec 2024 09:37:09 +0100
Subject: [PATCH 36/51] Fixed formatting issues

---
 .../compute_gaussian_grid_local_kokkos.cpp    | 17 ++++------
 .../compute_gaussian_grid_local_kokkos.h      |  4 +--
 src/KOKKOS/compute_sna_grid_kokkos_impl.h     | 34 +++++++++----------
 src/KOKKOS/compute_sna_grid_local_kokkos.h    |  4 +--
 .../compute_sna_grid_local_kokkos_impl.h      | 34 +++++++++----------
 src/ML-SNAP/compute_gaussian_grid_local.cpp   |  8 ++---
 src/ML-SNAP/compute_gaussian_grid_local.h     |  2 +-
 7 files changed, 48 insertions(+), 55 deletions(-)

diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
index 6913fd284b..99380e0d63 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -183,16 +183,15 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
   // max_neighs is defined here - think of more elaborate methods.
   max_neighs = 100;
 
-  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total 
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
   // number of atoms.
-
   ntotal = atomKK->nlocal + atomKK->nghost;
   // Allocate view for number of neighbors per grid point
   MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
 
-  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
   // `total_range` is the number of grid points which may be larger than chunk size.
-  //printf(">>> total_range: %d\n", total_range);
+  // printf(">>> total_range: %d\n", total_range);
   chunksize = 32768; // 100*32768
   chunk_size = MIN(chunksize, total_range);
   chunk_offset = 0;
@@ -212,8 +211,8 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
     h1 = domain->h[1];
     h2 = domain->h[2];
     h3 = domain->h[3];
-    h4 = domain->h[4];   
-    h5 = domain->h[5];   
+    h4 = domain->h[4];
+    h5 = domain->h[5];
     lo0 = domain->boxlo[0];
     lo1 = domain->boxlo[1];
     lo2 = domain->boxlo[2];
@@ -332,7 +331,6 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianG
 
   // currently, all grid points are type 1
   // not clear what a better choice would be
-  
   const int itype = 1;
   int ielem = 0;
   ielem = d_map[itype];
@@ -340,10 +338,8 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianG
 
   // Compute the number of neighbors, store rsq
   int ninside = 0;
-  
 
   // Looping over ntotal for now.
-  
   for (int j = 0; j < ntotal; j++){
     const F_FLOAT dx = x(j,0) - xtmp;
     const F_FLOAT dy = x(j,1) - ytmp;
@@ -359,7 +355,6 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianG
   }
 
   //printf("%f\n", d_alocal(igrid, 6));
-  
 }
 
 /* ----------------------------------------------------------------------
@@ -382,4 +377,4 @@ template class ComputeGaussianGridLocalKokkos<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
 template class ComputeGaussianGridLocalKokkos<LMPHostType>;
 #endif
-}
\ No newline at end of file
+}
diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
index db3e87a7e9..deb5eaa8cb 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
@@ -93,8 +93,8 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
   int host_flag;
   int total_range; // total number of loop iterations in grid
   int xlen, ylen, zlen;
-  int chunksize; 
-  int ntotal; 
+  int chunksize;
+  int ntotal;
 
   typename AT::t_x_array_randomread x;
   typename AT::t_int_1d_randomread type;
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 81f3173a7d..2101d5968b 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -104,7 +104,7 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
   }
 
   // In pair snap some things like `map` get allocated regardless of chem flag.
-  if (chemflag){ 
+  if (chemflag){
     for (int i = 1; i <= atom->ntypes; i++) {
       h_map(i) = map[i];
     }
@@ -168,7 +168,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
 
   ComputeGrid::set_grid_global();
   ComputeGrid::set_grid_local();
-  
+
   // allocate arrays
   memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
 
@@ -206,14 +206,14 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   // max_neighs is defined here - think of more elaborate methods.
   max_neighs = 100;
 
-  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total 
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
   // number of atoms.
 
   ntotal = atomKK->nlocal + atomKK->nghost;
   // Allocate view for number of neighbors per grid point
   MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
 
-  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
   // `total_range` is the number of grid points which may be larger than chunk size.
   //printf(">>> total_range: %d\n", total_range);
   chunk_size = MIN(chunksize, total_range);
@@ -222,7 +222,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   snaKK.grow_rij(chunk_size, max_neighs);
 
   //chunk_size = total_range;
- 
+
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
@@ -236,8 +236,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
     h1 = domain->h[1];
     h2 = domain->h[2];
     h3 = domain->h[3];
-    h4 = domain->h[4];   
-    h5 = domain->h[5];   
+    h4 = domain->h[4];
+    h5 = domain->h[5];
     lo0 = domain->boxlo[0];
     lo1 = domain->boxlo[1];
     lo2 = domain->boxlo[2];
@@ -250,11 +250,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
 
     //printf(">>> chunk_offset: %d\n", chunk_offset);
 
-    //ComputeNeigh 
+    //ComputeNeigh
     {
       int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
 
-      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh> 
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
         policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
       policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
       Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
@@ -375,9 +375,9 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
   // Main difference is that we don't use the neighbor class or neighbor variables here.
   // This is because the grid points are not atoms and therefore do not get assigned
-  // neighbors in LAMMPS. 
-  // TODO: If we did make a neighborlist for each grid point, we could use current 
-  //       routines and avoid having to loop over all atoms (which limits us to 
+  // neighbors in LAMMPS.
+  // TODO: If we did make a neighborlist for each grid point, we could use current
+  //       routines and avoid having to loop over all atoms (which limits us to
   //       natoms = max team size).
 
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
@@ -468,7 +468,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   // Compute the number of neighbors, store rsq
   int ninside = 0;
-  
+
   // Looping over ntotal for now.
   for (int j = 0; j < ntotal; j++){
     const F_FLOAT dx = x(j,0) - xtmp;
@@ -480,12 +480,12 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     // don't include atoms that share location with grid point
     if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
       jtype = -1; // use -1 to signal it's outside the radius
-    } 
+    }
 
     if (jtype >= 0)
       ninside++;
 
-  } 
+  }
 
   /*
   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
@@ -500,7 +500,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     // don't include atoms that share location with grid point
     if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
       jtype = -1; // use -1 to signal it's outside the radius
-    } 
+    }
 
     type_cache[j] = jtype;
 
@@ -510,7 +510,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   }, ninside);
   */
 
-  d_ninside(ii) = ninside; 
+  d_ninside(ii) = ninside;
 
   // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
   int offset = 0;
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h
index d11d2e1623..9073b921c1 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@@ -240,7 +240,7 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
 
   typename AT::t_x_array_randomread x;
   typename AT::t_int_1d_randomread type;
-  
+
   DAT::tdual_float_2d k_alocal;
   typename AT::t_float_2d d_alocal;
 
@@ -313,4 +313,4 @@ class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos<DeviceTyp
 }
 
 #endif
-#endif
\ No newline at end of file
+#endif
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
index 73a9df39ac..8f6958904b 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -104,7 +104,7 @@ ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridL
   }
 
   // In pair snap some things like `map` get allocated regardless of chem flag.
-  if (chemflag){ 
+  if (chemflag){
     for (int i = 1; i <= atom->ntypes; i++) {
       h_map(i) = map[i];
     }
@@ -171,7 +171,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::setup()
   //ComputeGrid::set_grid_local();
   //ComputeSNAGridLocal::setup();
   ComputeGridLocal::setup();
-  
+
   // allocate arrays
   //memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
   memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
@@ -215,7 +215,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
   // max_neighs is defined here - think of more elaborate methods.
   max_neighs = 100;
 
-  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total 
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
   // number of atoms.
 
   ntotal = atomKK->nlocal + atomKK->nghost;
@@ -223,7 +223,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
   //printf(">>> total_range: %d\n", total_range);
   MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range);
 
-  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
   // `total_range` is the number of grid points which may be larger than chunk size.
   //printf(">>> total_range: %d\n", total_range);
   chunk_size = MIN(chunksize, total_range);
@@ -232,7 +232,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
   snaKK.grow_rij(chunk_size, max_neighs);
 
   //chunk_size = total_range;
- 
+
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
@@ -246,8 +246,8 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
     h1 = domain->h[1];
     h2 = domain->h[2];
     h3 = domain->h[3];
-    h4 = domain->h[4];   
-    h5 = domain->h[5];   
+    h4 = domain->h[4];
+    h5 = domain->h[5];
     lo0 = domain->boxlo[0];
     lo1 = domain->boxlo[1];
     lo2 = domain->boxlo[2];
@@ -260,11 +260,11 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
 
     //printf(">>> chunk_offset: %d\n", chunk_offset);
 
-    //ComputeNeigh 
+    //ComputeNeigh
     {
       int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
 
-      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridLocalComputeNeigh> 
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridLocalComputeNeigh>
         policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
       policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
       Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
@@ -381,9 +381,9 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
   // Main difference is that we don't use the neighbor class or neighbor variables here.
   // This is because the grid points are not atoms and therefore do not get assigned
-  // neighbors in LAMMPS. 
-  // TODO: If we did make a neighborlist for each grid point, we could use current 
-  //       routines and avoid having to loop over all atoms (which limits us to 
+  // neighbors in LAMMPS.
+  // TODO: If we did make a neighborlist for each grid point, we could use current
+  //       routines and avoid having to loop over all atoms (which limits us to
   //       natoms = max team size).
 
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
@@ -487,7 +487,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
 
   // Compute the number of neighbors, store rsq
   int ninside = 0;
-  
+
   // Looping over ntotal for now.
   for (int j = 0; j < ntotal; j++){
     const F_FLOAT dx = x(j,0) - xtmp;
@@ -499,12 +499,12 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
     // don't include atoms that share location with grid point
     if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
       jtype = -1; // use -1 to signal it's outside the radius
-    } 
+    }
 
     if (jtype >= 0)
       ninside++;
 
-  } 
+  }
 
   /*
   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
@@ -519,7 +519,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
     // don't include atoms that share location with grid point
     if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
       jtype = -1; // use -1 to signal it's outside the radius
-    } 
+    }
 
     type_cache[j] = jtype;
 
@@ -529,7 +529,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   }, ninside);
   */
 
-  d_ninside(ii) = ninside; 
+  d_ninside(ii) = ninside;
 
   // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
   int offset = 0;
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp
index c18aa31d05..81286f9d81 100644
--- a/src/ML-SNAP/compute_gaussian_grid_local.cpp
+++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp
@@ -1,7 +1,7 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/ Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
+   LAMMPS development team: developers@lammps.org
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@@ -61,9 +61,8 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char *
   for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp);
   for (int i = 0; i < ntypes; i++)
     sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp);
-  
-  // construct cutsq
 
+  // construct cutsq
   double cut;
   cutmax = 0.0;
   memory->create(cutsq, ntypes + 1, ntypes + 1, "gaussian/atom:cutsq");
@@ -80,7 +79,6 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char *
   size_local_cols = size_local_cols_base + ntypes;
 
   // pre-compute coefficients
-  
   for (int i = 0; i < ntypes; i++) {
     prefacelem[i + 1] = 1.0/powint(sigmaelem[i + 1] * sqrt(MY_2PI), 3);
     argfacelem[i + 1] = 1.0/(2.0 * sigmaelem[i + 1] * sigmaelem[i + 1]);
@@ -155,7 +153,7 @@ void ComputeGaussianGridLocal::compute_local()
             alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]);
           }
         }
-	    igrid++;
+        igrid++;
   }
 }
 
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h b/src/ML-SNAP/compute_gaussian_grid_local.h
index 72e7326b49..77f88a7a8e 100644
--- a/src/ML-SNAP/compute_gaussian_grid_local.h
+++ b/src/ML-SNAP/compute_gaussian_grid_local.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/ Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
+   LAMMPS development team: developers@lammps.org
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains

From 80acfeebe7e7c21c113f24dbbb4d6ba9e34971b5 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Mon, 9 Dec 2024 14:04:47 +0100
Subject: [PATCH 37/51] Added documentation

---
 doc/src/compute_gaussian_grid_local.rst | 99 +++++++++++++++++++++++++
 doc/src/compute_sna_atom.rst            |  9 ++-
 2 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 doc/src/compute_gaussian_grid_local.rst

diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst
new file mode 100644
index 0000000000..24f41586d1
--- /dev/null
+++ b/doc/src/compute_gaussian_grid_local.rst
@@ -0,0 +1,99 @@
+.. index:: compute sna/atom
+.. index:: compute snad/atom
+.. index:: compute snav/atom
+.. index:: compute snap
+.. index:: compute sna/grid
+.. index:: compute sna/grid/local
+
+
+compute gaussian/grid/local command
+===================================
+
+compute gaussian/grid/local/kk command
+======================================
+
+Syntax
+""""""
+
+.. code-block:: LAMMPS
+
+   compute ID group-ID gaussian/grid nx ny nz rcutfac R_1 R_2 ... sigma_1 sigma_2 ...
+   compute ID group-ID gaussian/grid/local nx ny nz rcutfac R_1 R_2 ... sigma_1 sigma_2 ...
+
+* ID, group-ID are documented in :doc:`compute <compute>` command
+* gaussian/grid/local = style name of this compute command
+* rcutfac = scale factor applied to all cutoff radii (positive real)
+* sigma_1, sigma_2,... = Gaussian broadening, one for each type (positive real)
+* R_1, R_2,... = list of cutoff radii, one for each type (distance units)
+* nx, ny, nz = number of grid points in x, y, and z directions (positive integer)
+
+Examples
+""""""""
+
+.. code-block:: LAMMPS
+
+    compute ggrid all gaussian/grid/local  grid 40 40 40  4.0  0.5 0.5  0.4 0.4
+
+Description
+"""""""""""
+
+Define a computation that calculates a Gaussian representation of the ionic
+structure. This representation is used for the efficient evaluation
+of quantities related to the structure factor in a grid-based workflow,
+such as the ML-DFT workflow MALA :ref:`(Ellis) <Ellis2021>`, for which it was originally
+implemented. Usage of the workflow is described in a separate publication :ref:`(Fiedler) <Fiedler2023>`.
+
+For each atomic species, a separate sum of Gaussians is calculated, using
+a separate Gaussian broadening per species. The computation
+is always performed on the numerical grid, no atom-based version of this
+compute exists. The Gaussian representation can only be executed in a local
+fashion, thus the output array only contains  rows for grid points
+that are local to the processor subdomain. The layout of the grid is the same
+as for the :doc:`sna/grid/local <compute_sna_atom>` command.
+
+Namely, the array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest.  Each row of the array contains
+the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the values of the Gaussians
+(one floating point number per species per grid point).
+
+Computation of these Gaussians can be accelerated via Kokkos through the
+*gaussian/grid/local/kk* command.
+
+----------
+
+Output info
+"""""""""""
+
+Compute *gaussian/grid/local* evaluates a local array.
+The array contains one row for each of the
+local grid points, looping over the global index *ix* fastest,
+then *iy*, and *iz* slowest.  Each row of the array contains
+the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the values of the Gaussians
+(one floating point number per species per grid point).
+
+Restrictions
+""""""""""""
+
+These computes are part of the ML-SNAP package.  They are only enabled
+if LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.
+
+Related commands
+""""""""""""""""
+
+:doc:`compute sna/grid/local <compute_sna_atom>`
+
+----------
+
+.. _Ellis2021:
+
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam,  Phys Rev B, 104, 035120, (2021)
+
+.. _Fiedler2023:
+
+**(Fiedler)** Fiedler, Modine, Schmerler, Vogel, Popoola, Thompson, Rajamanickam, and Cangi,
+`npj Comp. Mater., 9, 115 (2023) <https://doi.org/10.1038/s41524-023-01070-z>`_
+
diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst
index 179c362dc6..776ccca5a6 100644
--- a/doc/src/compute_sna_atom.rst
+++ b/doc/src/compute_sna_atom.rst
@@ -20,9 +20,15 @@ compute snap command
 compute sna/grid command
 ========================
 
+compute sna/grid/kk command
+===========================
+
 compute sna/grid/local command
 ==============================
 
+compute sna/grid/local/kk command
+=================================
+
 Syntax
 """"""
 
@@ -252,7 +258,8 @@ for finite-temperature Kohn-Sham density functional theory (:ref:`Ellis
 et al. <Ellis2021>`) Neighbor atoms not in the group do not contribute
 to the bispectrum components of the grid points. The distance cutoff
 :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom
-*i'*.
+*i'*. Both computes can be hardware accelerated with Kokkos by using the
+*sna/grid/kk* and *sna/grid/local/kk* commands, respectively.
 
 Compute *sna/grid* calculates a global array containing bispectrum
 components for a regular grid of points.

From f93dd3273d0f6f96b4e537cbf02a5b6dcba8f757 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Mon, 9 Dec 2024 14:08:22 +0100
Subject: [PATCH 38/51] Added link to PRB paper

---
 doc/src/compute_gaussian_grid_local.rst | 2 +-
 doc/src/compute_sna_atom.rst            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst
index 24f41586d1..226402bc22 100644
--- a/doc/src/compute_gaussian_grid_local.rst
+++ b/doc/src/compute_gaussian_grid_local.rst
@@ -90,7 +90,7 @@ Related commands
 
 .. _Ellis2021:
 
-**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam,  Phys Rev B, 104, 035120, (2021)
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
 
 .. _Fiedler2023:
 
diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst
index 776ccca5a6..28611ae3a7 100644
--- a/doc/src/compute_sna_atom.rst
+++ b/doc/src/compute_sna_atom.rst
@@ -661,7 +661,7 @@ of Angular Momentum, World Scientific, Singapore (1987).
 
 .. _Ellis2021:
 
-**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam,  Phys Rev B, 104, 035120, (2021)
+**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
 
 .. _Lafourcade2023_2:
 

From f59f084c37d05461fcc85c2c73f9d2c0b128e7c4 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Tue, 10 Dec 2024 10:39:57 +0100
Subject: [PATCH 39/51] Added logs for examples

---
 examples/snap/log.10Dec24.gaussian.grid.g++.1 | 57 +++++++++++++++++++
 examples/snap/log.10Dec24.gaussian.grid.g++.4 | 57 +++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 examples/snap/log.10Dec24.gaussian.grid.g++.1
 create mode 100644 examples/snap/log.10Dec24.gaussian.grid.g++.4

diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.1 b/examples/snap/log.10Dec24.gaussian.grid.g++.1
new file mode 100644
index 0000000000..230008ec97
--- /dev/null
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.1
@@ -0,0 +1,57 @@
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-57-gf93dd3273d)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
+  using 1 OpenMP thread(s) per MPI task
+Lattice spacing in x,y,z = 3.316 3.316 3.316
+Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  1 by 1 by 1 MPI processor grid
+Created 2 atoms
+  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  create_atoms CPU = 0.002 seconds
+1 atoms in group snapgroup
+WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 6.67637
+  ghost atom cutoff = 6.67637
+  binsize = 3.338185, bins = 1 1 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+Setting up Verlet run ...
+  Unit style    : metal
+  Current step  : 0
+  Time step     : 0.001
+Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   0              0              0              0              0            
+Loop time of 6.83e-07 on 1 procs for 0 steps with 2 atoms
+
+146.4% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 6.83e-07   |            |       |100.00
+
+Nlocal:              2 ave           2 max           2 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:            339 ave         339 max         339 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:             64 ave          64 max          64 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 64
+Ave neighs/atom = 32
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:00
diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.4 b/examples/snap/log.10Dec24.gaussian.grid.g++.4
new file mode 100644
index 0000000000..f46db86fc7
--- /dev/null
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4
@@ -0,0 +1,57 @@
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-57-gf93dd3273d)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
+  using 1 OpenMP thread(s) per MPI task
+Lattice spacing in x,y,z = 3.316 3.316 3.316
+Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  1 by 1 by 1 MPI processor grid
+Created 2 atoms
+  using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
+  create_atoms CPU = 0.004 seconds
+1 atoms in group snapgroup
+WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 1 steps, delay = 0 steps, check = yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 6.67637
+  ghost atom cutoff = 6.67637
+  binsize = 3.338185, bins = 1 1 1
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+Setting up Verlet run ...
+  Unit style    : metal
+  Current step  : 0
+  Time step     : 0.001
+Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   0              0              0              0              0            
+Loop time of 6.18e-07 on 1 procs for 0 steps with 2 atoms
+
+161.8% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 6.18e-07   |            |       |100.00
+
+Nlocal:              2 ave           2 max           2 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:            339 ave         339 max         339 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:             64 ave          64 max          64 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 64
+Ave neighs/atom = 32
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:00

From 16e0a7788acdac870038a527f0607dc7d7e8e112 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Fri, 13 Dec 2024 09:51:07 +0100
Subject: [PATCH 40/51] Now actually added the correct log

---
 examples/snap/log.10Dec24.gaussian.grid.g++.4 | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.4 b/examples/snap/log.10Dec24.gaussian.grid.g++.4
index f46db86fc7..fab0236dd6 100644
--- a/examples/snap/log.10Dec24.gaussian.grid.g++.4
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4
@@ -3,10 +3,10 @@ OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99
   using 1 OpenMP thread(s) per MPI task
 Lattice spacing in x,y,z = 3.316 3.316 3.316
 Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
-  1 by 1 by 1 MPI processor grid
+  1 by 2 by 2 MPI processor grid
 Created 2 atoms
   using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
-  create_atoms CPU = 0.004 seconds
+  create_atoms CPU = 0.003 seconds
 1 atoms in group snapgroup
 WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
 Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
@@ -22,16 +22,17 @@ Neighbor list info ...
       pair build: half/bin/atomonly/newton
       stencil: half/bin/3d
       bin: standard
+WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202)
 Setting up Verlet run ...
   Unit style    : metal
   Current step  : 0
   Time step     : 0.001
-Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes
-   Step          Temp          E_pair         E_mol          TotEng         Press     
-         0   0              0              0              0              0            
-Loop time of 6.18e-07 on 1 procs for 0 steps with 2 atoms
+Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press
+         0   0              0              0              0              0
+Loop time of 6.4355e-06 on 4 procs for 0 steps with 2 atoms
 
-161.8% CPU use with 1 MPI tasks x 1 OpenMP threads
+15.5% CPU use with 4 MPI tasks x 1 OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
@@ -41,14 +42,14 @@ Neigh   | 0          | 0          | 0          |   0.0 |  0.00
 Comm    | 0          | 0          | 0          |   0.0 |  0.00
 Output  | 0          | 0          | 0          |   0.0 |  0.00
 Modify  | 0          | 0          | 0          |   0.0 |  0.00
-Other   |            | 6.18e-07   |            |       |100.00
+Other   |            | 6.435e-06  |            |       |100.00
 
-Nlocal:              2 ave           2 max           2 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost:            339 ave         339 max         339 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs:             64 ave          64 max          64 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
+Nlocal:            0.5 ave           1 max           0 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost:          274.5 ave         275 max         274 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Neighs:             16 ave          40 max           0 min
+Histogram: 2 0 0 0 0 0 1 0 0 1
 
 Total # of neighbors = 64
 Ave neighs/atom = 32

From bff2e64bbc60833b3a1af7b91763683f6c12151b Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Thu, 19 Dec 2024 10:06:53 +0100
Subject: [PATCH 41/51] Docs and example updates by Aidan Thompson

I did NOT author this commit; I am only pushing it because, for some reason, GitHub does not permit Aidan to do so.

Co-authored-by: Aidan Thompson <athomps@sandia.gov>
---
 doc/src/Commands_compute.rst                  |  5 +-
 doc/src/compute.rst                           |  1 +
 doc/src/compute_gaussian_grid_local.rst       | 56 ++++++-----
 doc/src/compute_sna_atom.rst                  | 31 +++---
 doc/utils/sphinx-config/false_positives.txt   |  2 +
 examples/snap/README.md                       |  6 ++
 examples/snap/in.gaussian.grid                | 10 +-
 examples/snap/in.snap.grid.triclinic          |  1 -
 examples/snap/log.10Dec24.gaussian.grid.g++.1 | 96 ++++++++++++++++---
 examples/snap/log.10Dec24.gaussian.grid.g++.4 | 96 ++++++++++++++++---
 10 files changed, 232 insertions(+), 72 deletions(-)

diff --git a/doc/src/Commands_compute.rst b/doc/src/Commands_compute.rst
index fd68ce3e39..7c73583a4f 100644
--- a/doc/src/Commands_compute.rst
+++ b/doc/src/Commands_compute.rst
@@ -58,6 +58,7 @@ KOKKOS, o = OPENMP, t = OPT.
    * :doc:`fep/ta <compute_fep_ta>`
    * :doc:`force/tally <compute_tally>`
    * :doc:`fragment/atom <compute_cluster_atom>`
+   * :doc:`gaussian/grid/local (k) <compute_gaussian_grid_local>`
    * :doc:`global/atom <compute_global_atom>`
    * :doc:`group/group <compute_group_group>`
    * :doc:`gyration <compute_gyration>`
@@ -140,8 +141,8 @@ KOKKOS, o = OPENMP, t = OPT.
    * :doc:`smd/vol <compute_smd_vol>`
    * :doc:`snap <compute_sna_atom>`
    * :doc:`sna/atom <compute_sna_atom>`
-   * :doc:`sna/grid <compute_sna_atom>`
-   * :doc:`sna/grid/local <compute_sna_atom>`
+   * :doc:`sna/grid (k) <compute_sna_atom>`
+   * :doc:`sna/grid/local (k) <compute_sna_atom>`
    * :doc:`snad/atom <compute_sna_atom>`
    * :doc:`snav/atom <compute_sna_atom>`
    * :doc:`sph/e/atom <compute_sph_e_atom>`
diff --git a/doc/src/compute.rst b/doc/src/compute.rst
index 082f93a6c4..9a8a1734fb 100644
--- a/doc/src/compute.rst
+++ b/doc/src/compute.rst
@@ -236,6 +236,7 @@ The individual style names on the :doc:`Commands compute <Commands_compute>` pag
 * :doc:`fep/ta <compute_fep_ta>` - compute free energies for a test area perturbation
 * :doc:`force/tally <compute_tally>` - force between two groups of atoms via the tally callback mechanism
 * :doc:`fragment/atom <compute_cluster_atom>` - fragment ID for each atom
+* :doc:`gaussian/grid/local <compute_gaussian_grid_local>` - local array of Gaussian atomic contributions on a regular grid
 * :doc:`global/atom <compute_global_atom>` - assign global values to each atom from arrays of global values
 * :doc:`group/group <compute_group_group>` - energy/force between two groups of atoms
 * :doc:`gyration <compute_gyration>` - radius of gyration of group of atoms
diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst
index 226402bc22..45ef6642c9 100644
--- a/doc/src/compute_gaussian_grid_local.rst
+++ b/doc/src/compute_gaussian_grid_local.rst
@@ -1,38 +1,31 @@
-.. index:: compute sna/atom
-.. index:: compute snad/atom
-.. index:: compute snav/atom
-.. index:: compute snap
-.. index:: compute sna/grid
-.. index:: compute sna/grid/local
-
+.. index:: compute gaussian/grid/local
+.. index:: compute gaussian/grid/local/kk
 
 compute gaussian/grid/local command
 ===================================
 
-compute gaussian/grid/local/kk command
-======================================
+Accelerator Variants: *gaussian/grid/local/kk*
 
 Syntax
 """"""
 
 .. code-block:: LAMMPS
 
-   compute ID group-ID gaussian/grid nx ny nz rcutfac R_1 R_2 ...  R_1 R_2 ... sigma_1 sigma_2
-   compute ID group-ID gaussian/grid/local nx ny nz rcutfac  R_1 R_2 ... sigma_1 sigma_2
+   compute ID group-ID gaussian/grid/local grid nx ny nz rcutfac  R_1 R_2 ... sigma_1 sigma_2
 
 * ID, group-ID are documented in :doc:`compute <compute>` command
-* sna/atom = style name of this compute command
-* rcutfac = scale factor applied to all cutoff radii (positive real)
-* sigma_1, sigma_2,... = Gaussian broadening, one for each type (positive real)
-* R_1, R_2,... = list of cutoff radii, one for each type (distance units)
-* nx, ny, nz = number of grid points in x, y, and z directions (positive integer)
+* gaussian/grid/local = style name of this compute command
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
+* *sigma_1, sigma_2,...* = Gaussian widths, one for each type (distance units)
 
 Examples
 """"""""
 
 .. code-block:: LAMMPS
 
-    compute ggrid all gaussian/grid/local  grid 40 40 40  4.0  0.5 0.5  0.4 0.4
+    compute mygrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4
 
 Description
 """""""""""
@@ -40,14 +33,14 @@ Description
 Define a computation that calculates a Gaussian representation of the ionic
 structure. This representation is used for the efficient evaluation
 of quantities related to the structure factor in a grid-based workflow,
-such as the ML-DFT workflow MALA :ref:`(Ellis)) <Ellis2021>`, for which it was originally
+such as the ML-DFT workflow MALA :ref:`(Ellis) <Ellis2021b>`, for which it was originally
 implemented. Usage of the workflow is described in a separate publication :ref:`(Fiedler) <Fiedler2023>`.
 
-For each atomic species, a separate sum of Gaussians is calculated, using
-a separate Gaussian broadening per species. The computation
+For each LAMMPS type, a separate sum of Gaussians is calculated, using
+a separate Gaussian broadening per type. The computation
 is always performed on the numerical grid, no atom-based version of this
 compute exists. The Gaussian representation can only be executed in a local
-fashion, thus the output array only contains  rows for grid points
+fashion, thus the output array only contains rows for grid points
 that are local to the processor subdomain. The layout of the grid is the same
 as for the see :doc:`sna/grid/local <compute_sna_atom>` command.
 
@@ -56,10 +49,14 @@ local grid points, looping over the global index *ix* fastest,
 then *iy*, and *iz* slowest.  Each row of the array contains
 the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*,
 and *z* coordinates of the grid point, followed by the values of the Gaussians
-(one floating point number per species per grid point).
+(one floating point number per type per grid point).
 
-Computation of these Gaussians can be accelerated via Kokkos through the
-*gaussian/grid/local/kk* command.
+----------
+
+
+.. include:: accel_styles.rst
+
+	     
 
 ----------
 
@@ -69,10 +66,11 @@ Output info
 Compute *gaussian/grid/local* evaluates a local array.
 The array contains one row for each of the
 local grid points, looping over the global index *ix* fastest,
-then *iy*, and *iz* slowest.  Each row of the array contains
-the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*,
-and *z* coordinates of the grid point, followed by the values of the Gaussians
-(one floating point number per species per grid point).
+then *iy*, and *iz* slowest.  The array contains :math:`ntypes+6` columns,
+where *ntypes* is the number of LAMMPS types. The first three columns are
+the global indexes *ix*, *iy*, and *iz*, followed by the *x*, *y*,
+and *z* coordinates of the grid point, followed by the *ntypes* columns
+containing the values of the Gaussians for each type.
 
 Restrictions
 """"""""""""
@@ -88,7 +86,7 @@ Related commands
 
 ----------
 
-.. _Ellis2021:
+.. _Ellis2021b:
 
 **(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) <https://doi.org/10.1103/PhysRevB.104.035120>`_
 
diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst
index 28611ae3a7..2079234ddf 100644
--- a/doc/src/compute_sna_atom.rst
+++ b/doc/src/compute_sna_atom.rst
@@ -3,7 +3,9 @@
 .. index:: compute snav/atom
 .. index:: compute snap
 .. index:: compute sna/grid
+.. index:: compute sna/grid/kk
 .. index:: compute sna/grid/local
+.. index:: compute sna/grid/local/kk
 
 compute sna/atom command
 ========================
@@ -26,8 +28,7 @@ compute sna/grid/kk command
 compute sna/grid/local command
 ==============================
 
-compute sna/grid/local/kk command
-=================================
+Accelerator Variants: *sna/grid/local/kk*
 
 Syntax
 """"""
@@ -39,17 +40,17 @@ Syntax
    compute ID group-ID snav/atom rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
    compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
    compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
-   compute ID group-ID sna/grid/local nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
+   compute ID group-ID sna/grid/local grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ...
 
 * ID, group-ID are documented in :doc:`compute <compute>` command
 * sna/atom = style name of this compute command
-* rcutfac = scale factor applied to all cutoff radii (positive real)
-* rfac0 = parameter in distance to angle conversion (0 < rcutfac < 1)
-* twojmax = band limit for bispectrum components (non-negative integer)
-* R_1, R_2,... = list of cutoff radii, one for each type (distance units)
-* w_1, w_2,... = list of neighbor weights, one for each type
-* nx, ny, nz = number of grid points in x, y, and z directions (positive integer)
+* *rcutfac* = scale factor applied to all cutoff radii (positive real)
+* *rfac0* = parameter in distance to angle conversion (0 < rfac0 < 1)
+* *twojmax* = band limit for bispectrum components (non-negative integer)
+* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units)
+* *w_1, w_2,...* = list of neighbor weights, one for each type
+* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer)
 * zero or more keyword/value pairs may be appended
 * keyword = *rmin0* or *switchflag* or *bzeroflag* or *quadraticflag* or *chem* or *bnormflag* or *wselfallflag* or *bikflag* or *switchinnerflag* or *sinner* or *dinner* or *dgradflag* or *nnn* or *wmode* or *delta*
 
@@ -109,7 +110,7 @@ Examples
    compute snap all snap 1.4 0.95 6 2.0 1.0
    compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 chem 2 0 1
    compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 switchinnerflag 1 sinner 1.35 1.6 dinner 0.25 0.3
-   compute bgrid all sna/grid/local 200 200 200 1.4 0.95 6 2.0 1.0
+   compute bgrid all sna/grid/local grid 200 200 200 1.4 0.95 6 2.0 1.0
    compute bnnn all sna/atom 9.0 0.99363 8 0.5 1.0 rmin0 0.0 nnn 24 wmode 1 delta 0.2
 
 Description
@@ -259,7 +260,7 @@ et al. <Ellis2021>`) Neighbor atoms not in the group do not contribute
 to the bispectrum components of the grid points. The distance cutoff
 :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom
 *i'*. Both computes can be hardware accelerated with Kokkos by using the
-*sna/grid/kk* and *sna/grid/local/kk* coammnds, respectively.
+*sna/grid/kk* and *sna/grid/local/kk* commands, respectively.
 
 Compute *sna/grid* calculates a global array containing bispectrum
 components for a regular grid of points.
@@ -470,6 +471,12 @@ fluctuations in the resulting local atomic environment fingerprint.  The
 detailed formalism is given in the paper by Lafourcade et
 al. :ref:`(Lafourcade) <Lafourcade2023_2>`.
 
+----------
+
+
+.. include:: accel_styles.rst
+
+	     
 ----------
 
 Output info
diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt
index 8e601d6c16..34e56539fc 100644
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@@ -3360,6 +3360,7 @@ Schilfgarde
 Schimansky
 Schiotz
 Schlitter
+Schmerler
 Schmid
 Schnieders
 Schoen
@@ -4021,6 +4022,7 @@ VMDARCH
 VMDHOME
 vn
 Voigt
+Vogel
 volfactor
 Volkov
 Volpe
diff --git a/examples/snap/README.md b/examples/snap/README.md
index 305f920ae8..1df24acf1f 100644
--- a/examples/snap/README.md
+++ b/examples/snap/README.md
@@ -9,5 +9,11 @@ in.snap.Mo_Chen                   # SNAP linear Mo potential
 in.snap.compute                   # SNAP compute for training a linear model
 in.snap.compute.quadratic         # SNAP compute for training a quadratic model
 in.snap.scale.Ni_Zuo_JCPA2020     # SNAP linear Ni potential with thermodynamic integration (fix adapt scale)
+in.C_SNAP                         # SNAP carbon potential
 
 compute_snap_dgrad.py             # SNAP compute with dgradflag (dBi/dRj) for training a non-linear model
+
+in.snap.grid                      # SNAP descriptors on a grid
+in.snap.grid.triclinic            # SNAP descriptors on a grid, triclinic
+in.gaussian.grid                  # Gaussian descriptors on a grid
+
diff --git a/examples/snap/in.gaussian.grid b/examples/snap/in.gaussian.grid
index 9caa61e455..48aeec1632 100644
--- a/examples/snap/in.gaussian.grid
+++ b/examples/snap/in.gaussian.grid
@@ -4,6 +4,7 @@
 # sitting on an atom of type 1 or 2:
 # val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
 # val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
 # 
 
 variable 	nrep index 1
@@ -36,7 +37,6 @@ mass 		* 180.88
 
 # define atom compute and grid compute
 
-group 		snapgroup type 1
 variable 	rcutfac equal 4.67637
 variable 	radelem1 equal 0.5
 variable 	radelem2 equal 0.5
@@ -57,10 +57,12 @@ compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} &
 
 # define output
 
-dump 1 all local 1000 dump.glocal c_mygridlocal[*]
-dump 2 all custom 1000 dump.gatom id x y z
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
 
 # run
 
 run		0
-
diff --git a/examples/snap/in.snap.grid.triclinic b/examples/snap/in.snap.grid.triclinic
index 95a14f3bb4..59063f576e 100644
--- a/examples/snap/in.snap.grid.triclinic
+++ b/examples/snap/in.snap.grid.triclinic
@@ -47,7 +47,6 @@ lattice		custom $a &
 		basis 0.0 0.0 0.5 &
 		spacing 1 1 1
 
-box 		tilt large
 region		box prism 0 ${nx} 0 ${ny} 0 ${nz} ${ny} ${nz} ${nz}
 create_box	1 box
 create_atoms	1 box
diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.1 b/examples/snap/log.10Dec24.gaussian.grid.g++.1
index 230008ec97..b158ac07d0 100644
--- a/examples/snap/log.10Dec24.gaussian.grid.g++.1
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.1
@@ -1,13 +1,89 @@
-LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-57-gf93dd3273d)
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
 OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
   using 1 OpenMP thread(s) per MPI task
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+#
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable       	nx equal 1
+variable 	ny equal ${nrep}
+variable 	ny equal 1
+variable 	nz equal ${nrep}
+variable 	nz equal 1
+
+boundary	p p p
+
+lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
 Lattice spacing in x,y,z = 3.316 3.316 3.316
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+region		box block 0 1 0 ${ny} 0 ${nz}
+region		box block 0 1 0 1 0 ${nz}
+region		box block 0 1 0 1 0 1
+create_box	2 box
 Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
   1 by 1 by 1 MPI processor grid
+create_atoms	1 box basis 1 1 basis 2 2
 Created 2 atoms
   using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
-  create_atoms CPU = 0.002 seconds
-1 atoms in group snapgroup
+  create_atoms CPU = 0.001 seconds
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 0.2
+
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_style      zero 4.67637
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
 WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
 Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
 Neighbor list info ...
@@ -22,16 +98,12 @@ Neighbor list info ...
       pair build: half/bin/atomonly/newton
       stencil: half/bin/3d
       bin: standard
-Setting up Verlet run ...
-  Unit style    : metal
-  Current step  : 0
-  Time step     : 0.001
 Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes
-   Step          Temp          E_pair         E_mol          TotEng         Press     
-         0   0              0              0              0              0            
-Loop time of 6.83e-07 on 1 procs for 0 steps with 2 atoms
+   Step         c_val1         c_val2    
+         0   25.521859      7.9367045    
+Loop time of 1.088e-06 on 1 procs for 0 steps with 2 atoms
 
-146.4% CPU use with 1 MPI tasks x 1 OpenMP threads
+183.8% CPU use with 1 MPI tasks x 1 OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
@@ -41,7 +113,7 @@ Neigh   | 0          | 0          | 0          |   0.0 |  0.00
 Comm    | 0          | 0          | 0          |   0.0 |  0.00
 Output  | 0          | 0          | 0          |   0.0 |  0.00
 Modify  | 0          | 0          | 0          |   0.0 |  0.00
-Other   |            | 6.83e-07   |            |       |100.00
+Other   |            | 1.088e-06  |            |       |100.00
 
 Nlocal:              2 ave           2 max           2 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.4 b/examples/snap/log.10Dec24.gaussian.grid.g++.4
index fab0236dd6..54cc842bc7 100644
--- a/examples/snap/log.10Dec24.gaussian.grid.g++.4
+++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4
@@ -1,13 +1,89 @@
-LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-57-gf93dd3273d)
+LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a)
 OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99)
   using 1 OpenMP thread(s) per MPI task
+# Demonstrate calculation of Gaussian descriptors on a grid
+# for a cell with two atoms of type 1 and type 2.
+# The output in dump.glocal shows that for grid points
+# sitting on an atom of type 1 or 2:
+# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219
+# val2 = 1.0/(0.2   *sqrt(2.0*pi))**3 = 7.93670
+# These values are extracted to the log file
+#
+
+variable 	nrep index 1
+variable 	a index 3.316
+variable 	ngrid index 2
+
+units		metal
+atom_modify	map hash
+
+# generate the box and atom positions using a BCC lattice
+
+variable       	nx equal ${nrep}
+variable       	nx equal 1
+variable 	ny equal ${nrep}
+variable 	ny equal 1
+variable 	nz equal ${nrep}
+variable 	nz equal 1
+
+boundary	p p p
+
+lattice		custom $a 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
+lattice		custom 3.316 		a1 1 0 0 		a2 0 1 0  		a3 0 0 1 		basis 0 0 0 		basis 0.5 0.5 0.5
 Lattice spacing in x,y,z = 3.316 3.316 3.316
+region		box block 0 ${nx} 0 ${ny} 0 ${nz}
+region		box block 0 1 0 ${ny} 0 ${nz}
+region		box block 0 1 0 1 0 ${nz}
+region		box block 0 1 0 1 0 1
+create_box	2 box
 Created orthogonal box = (0 0 0) to (3.316 3.316 3.316)
   1 by 2 by 2 MPI processor grid
+create_atoms	1 box basis 1 1 basis 2 2
 Created 2 atoms
   using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316)
-  create_atoms CPU = 0.003 seconds
-1 atoms in group snapgroup
+  create_atoms CPU = 0.001 seconds
+
+mass 		* 180.88
+
+# define atom compute and grid compute
+
+variable 	rcutfac equal 4.67637
+variable 	radelem1 equal 0.5
+variable 	radelem2 equal 0.5
+variable	sigmaelem1 equal 0.1355
+variable	sigmaelem2 equal 0.2
+variable 	gaussian_options string 		"${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}"
+4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 ${sigmaelem2}
+4.67637 0.5 0.5 0.1355 0.2
+
+# build zero potential to force ghost atom creation
+
+pair_style      zero ${rcutfac}
+pair_style      zero 4.67637
+pair_coeff      * *
+
+# define atom and grid computes
+
+compute 	mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	${gaussian_options}
+compute 	mygridlocal all gaussian/grid/local grid 2 2 2 	 	4.67637 0.5 0.5 0.1355 0.2
+
+# define output
+
+dump		1 all local 1000 dump.glocal c_mygridlocal[*]
+dump 		2 all custom 1000 dump.gatom id x y z
+compute		val1 all reduce max c_mygridlocal[7] inputs local
+compute		val2 all reduce max c_mygridlocal[8] inputs local
+thermo_style	custom step c_val1 c_val2
+
+# run
+
+run		0
 WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60)
 Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
 Neighbor list info ...
@@ -23,16 +99,12 @@ Neighbor list info ...
       stencil: half/bin/3d
       bin: standard
 WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202)
-Setting up Verlet run ...
-  Unit style    : metal
-  Current step  : 0
-  Time step     : 0.001
 Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes
-   Step          Temp          E_pair         E_mol          TotEng         Press
-         0   0              0              0              0              0
-Loop time of 6.4355e-06 on 4 procs for 0 steps with 2 atoms
+   Step         c_val1         c_val2    
+         0   25.521859      7.9367045    
+Loop time of 2.238e-06 on 4 procs for 0 steps with 2 atoms
 
-15.5% CPU use with 4 MPI tasks x 1 OpenMP threads
+89.4% CPU use with 4 MPI tasks x 1 OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
@@ -42,7 +114,7 @@ Neigh   | 0          | 0          | 0          |   0.0 |  0.00
 Comm    | 0          | 0          | 0          |   0.0 |  0.00
 Output  | 0          | 0          | 0          |   0.0 |  0.00
 Modify  | 0          | 0          | 0          |   0.0 |  0.00
-Other   |            | 6.435e-06  |            |       |100.00
+Other   |            | 2.238e-06  |            |       |100.00
 
 Nlocal:            0.5 ave           1 max           0 min
 Histogram: 2 0 0 0 0 0 0 0 0 2

From 824dcda382cbed99a4be27a72246412d7add53c9 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Thu, 19 Dec 2024 17:22:36 +0100
Subject: [PATCH 42/51] Fixed two style issues in the docs, got rid of printf
 that's also deleted on develop

---
 doc/src/compute_gaussian_grid_local.rst | 2 +-
 doc/src/compute_sna_atom.rst            | 2 +-
 src/KOKKOS/pair_snap_kokkos_impl.h      | 3 +--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst
index 45ef6642c9..4ae99e7b55 100644
--- a/doc/src/compute_gaussian_grid_local.rst
+++ b/doc/src/compute_gaussian_grid_local.rst
@@ -56,7 +56,7 @@ and *z* coordinates of the grid point, followed by the values of the Gaussians
 
 .. include:: accel_styles.rst
 
-	     
+
 
 ----------
 
diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst
index 2079234ddf..2572093499 100644
--- a/doc/src/compute_sna_atom.rst
+++ b/doc/src/compute_sna_atom.rst
@@ -476,7 +476,7 @@ al. :ref:`(Lafourcade) <Lafourcade2023_2>`.
 
 .. include:: accel_styles.rst
 
-	     
+
 ----------
 
 Output info
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index 247289042e..6c3cea43ce 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -551,9 +551,8 @@ template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh>::member_type& team) const {
 
-  printf("d_wjelem: %f %f %f %f\n", d_wjelem[0], d_wjelem[1], d_wjelem(0), d_wjelem(1));
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
-  
+
   // extract atom number
   int ii = team.team_rank() + team.league_rank() * team.team_size();
   if (ii >= chunk_size) return;

From 1f61c9ba828a952e76d53b3a4228ba48ab4d2832 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Thu, 19 Dec 2024 17:39:02 +0100
Subject: [PATCH 43/51] I forgot to include a change in merging develop that
 seems to be very important

---
 src/KOKKOS/pair_snap_kokkos_impl.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index 6c3cea43ce..2b9b862645 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -551,8 +551,6 @@ template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh>::member_type& team) const {
 
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
-
   // extract atom number
   int ii = team.team_rank() + team.league_rank() * team.team_size();
   if (ii >= chunk_size) return;

From 399f81cf462a166b86904e04e8aa883ea1217c0d Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Thu, 19 Dec 2024 19:20:41 +0100
Subject: [PATCH 44/51] Code by Aidan Thompson, I am only committing it; fixing
 the cyclical include that broke the build process after merging develop

Co-authored-by: Aidan Thompson <athomps@sandia.gov>
---
 src/KOKKOS/sna_kokkos.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index a438ccd25e..5ba5c159ac 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -29,7 +29,9 @@
 #endif
 
 namespace LAMMPS_NS {
-
+// copied from pair_snap_kokkos.h
+// pre-declare so sna_kokkos.h can refer to it
+template<class DeviceType, typename real_type_, int vector_length_> class PairSNAPKokkos;
 template<typename real_type_, int vector_length_>
 struct WignerWrapper {
   using real_type = real_type_;

From 3101bb326341d33aedd261bb47713384be801a24 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 14 Jan 2025 14:34:59 -0700
Subject: [PATCH 45/51] Add new files to GNU Make build system

---
 src/KOKKOS/Install.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 64ba0c6b03..daa64d64e1 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -106,6 +106,12 @@ action compute_composition_atom_kokkos.cpp compute_composition_atom.cpp
 action compute_composition_atom_kokkos.h compute_composition_atom.h
 action compute_orientorder_atom_kokkos.cpp
 action compute_orientorder_atom_kokkos.h
+action compute_sna_grid_kokkos.cpp compute_sna_grid.cpp
+action compute_sna_grid_kokkos.h compute_sna_grid.h
+action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp
+action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp
+action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h
+action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp
 action compute_temp_deform_kokkos.cpp
 action compute_temp_deform_kokkos.h
 action compute_temp_kokkos.cpp

From 0ee4bf621fc8e344afceba69f012b0f0aefd4496 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 14 Jan 2025 14:35:42 -0700
Subject: [PATCH 46/51] Fix some compile issues and remove unused variables

---
 src/KOKKOS/compute_sna_grid_kokkos.h          |   4 +-
 src/KOKKOS/compute_sna_grid_kokkos_impl.h     | 130 ++++++++---------
 src/KOKKOS/compute_sna_grid_local_kokkos.h    |   4 +-
 .../compute_sna_grid_local_kokkos_impl.h      | 131 ++++++++----------
 src/KOKKOS/pair_snap_kokkos.h                 |   1 -
 src/KOKKOS/pair_snap_kokkos_impl.h            |   3 +-
 src/KOKKOS/sna_kokkos.h                       |   4 +-
 src/KOKKOS/sna_kokkos_impl.h                  |  13 +-
 8 files changed, 130 insertions(+), 160 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index bd47059312..a65ff44546 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -232,7 +232,7 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
 
   Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
   Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
-  //Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
   Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
   Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
   Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
@@ -272,6 +272,8 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   double h0, h1, h2, h3, h4, h5;
   double lo0, lo1, lo2;
 
+  // Grant SNAKokkos access to this class's non-public members; snaKK is constructed from *this
+  friend class SNAKokkos<DeviceType, real_type, vector_length>;
 };
 
 // These wrapper classes exist to make the compute style factory happy/avoid having
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 2101d5968b..8275e810a3 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -121,13 +121,9 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
   }
   Kokkos::deep_copy(d_test,h_test);
 
-  double bytes =  MemKK::memory_usage(d_wjelem);
-
-  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
-    rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
   snaKK.grow_rij(0,0);
   snaKK.init();
-
 }
 
 // Destructor
@@ -380,8 +376,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   //       routines and avoid having to loop over all atoms (which limits us to
   //       natoms = max team size).
 
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
-
   // basic quantities associated with this team:
   // team_rank : rank of thread in this team
   // league_rank : rank of team in this league
@@ -399,10 +393,10 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   // This is used to cache whether or not an atom is within the cutoff.
   // If it is, type_cache is assigned to the atom type.
   // If it's not, it's assigned to -1.
-  const int tile_size = ntotal; //max_neighs; // number of elements per thread
-  const int team_rank = team.team_rank();
-  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
-  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+  //const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  //const int team_rank = team.team_rank();
+  //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
 
   // convert to grid indices
 
@@ -456,7 +450,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const int itype = 1;
   int ielem = 0;
   if (chemflag) ielem = d_map[itype];
-  const double radi = d_radelem[ielem];
+  //const double radi = d_radelem[ielem];
 
   // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
   // The purpose here is to transform for triclinic boxes.
@@ -525,22 +519,22 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
       int jelem = 0;
       if (chemflag) jelem = d_map[jtype];
-      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
-      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
-      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
       // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
       // actually since the views here have values starting at 0, let's use jelem
-      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-      my_sna.inside(ii,offset) = j;
+      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,offset) = j;
       if (switchinnerflag) {
-        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
       }
       if (chemflag)
-        my_sna.element(ii,offset) = jelem;
+        snaKK.element(ii,offset) = jelem;
       else
-        my_sna.element(ii,offset) = 0;
+        snaKK.element(ii,offset) = 0;
       offset++;
     }
   }
@@ -557,22 +551,22 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
       int jtype = type(j);
       int jelem = 0;
       if (chemflag) jelem = d_map[jtype];
-      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
-      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
-      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
       // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
       // actually since the views here have values starting at 0, let's use jelem
-      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-      my_sna.inside(ii,offset) = j;
+      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,offset) = j;
       if (switchinnerflag) {
-        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
       }
       if (chemflag)
-        my_sna.element(ii,offset) = jelem;
+        snaKK.element(ii,offset) = jelem;
       else
-        my_sna.element(ii,offset) = 0;
+        snaKK.element(ii,offset) = 0;
       offset++;
     }
   }
@@ -592,22 +586,22 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
         int jtype = type(j);
         int jelem = 0;
         if (chemflag) jelem = d_map[jtype];
-        my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
-        my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
-        my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+        snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+        snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+        snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
         // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
         // actually since the views here have values starting at 0, let's use jelem
-        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-        my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-        my_sna.inside(ii,offset) = j;
+        snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+        snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+        snaKK.inside(ii,offset) = j;
         if (switchinnerflag) {
-          my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-          my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+          snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+          snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
         }
         if (chemflag)
-          my_sna.element(ii,offset) = jelem;
+          snaKK.element(ii,offset) = jelem;
         else
-          my_sna.element(ii,offset) = 0;
+          snaKK.element(ii,offset) = 0;
       }
       offset++;
     }
@@ -619,7 +613,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int ii = iatom_mod + iatom_div * vector_length;
   if (ii >= chunk_size) return;
@@ -627,28 +620,26 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   const int ninside = d_ninside(ii);
   if (jnbor >= ninside) return;
 
-  my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
+  snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int ii = iatom_mod + iatom_div * vector_length;
   if (ii >= chunk_size) return;
 
-  int itype = type(ii);
+  //int itype = type(ii);
   // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp`
   int ielem = 0;
 
-  my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
+  snaKK.pre_ui(iatom_mod, j, ielem, iatom_div);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   // extract flattened atom_div / neighbor number / bend_location
   int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
@@ -667,7 +658,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     const int ninside = d_ninside(ii);
     if (jj >= ninside) return;
 
-    my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
   });
 
 }
@@ -675,7 +666,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   // extract flattened atom_div / neighbor number / bend location
   int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
@@ -692,28 +682,27 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     const int ninside = d_ninside(ii);
     if (jj >= ninside) return;
 
-    my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
+    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
   });
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
 
-  if (idxu > my_sna.idxu_max) return;
+  if (idxu > snaKK.idxu_max) return;
 
   int elem_count = chemflag ? nelements : 1;
 
   for (int ielem = 0; ielem < elem_count; ielem++){
 
-    const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
+    const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
 
-    auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
-    auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
 
     if (mapper.flip_sign == 1){
       utot_im = -utot_im;
@@ -721,11 +710,11 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
       utot_re = -utot_re;
     }
 
-    my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
+    snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
 
     if (mapper.flip_sign == 0) {
-      my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
-      my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+      snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+      snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
     }
   }
 }
@@ -733,46 +722,43 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
 
-  if (jjz >= my_sna.idxz_max) return;
+  if (jjz >= snaKK.idxz_max) return;
 
-  my_sna.compute_zi(iatom_mod,jjz,iatom_div);
+  snaKK.compute_zi(iatom_mod,jjz,iatom_div);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
 
-  if (jjb >= my_sna.idxb_max) return;
+  if (jjb >= snaKK.idxb_max) return;
 
-  my_sna.compute_bi(iatom_mod,jjb,iatom_div);
+  snaKK.compute_bi(iatom_mod,jjb,iatom_div);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
 
-  if (idxb >= my_sna.idxb_max) return;
+  if (idxb >= snaKK.idxb_max) return;
 
-  const int ntriples = my_sna.ntriples;
+  const int ntriples = snaKK.ntriples;
 
   for (int itriple = 0; itriple < ntriples; itriple++) {
 
-    const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
+    const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div);
 
-    my_sna.blist(iatom, itriple, idxb) = blocal;
+    snaKK.blist(iatom, itriple, idxb) = blocal;
   }
 
 }
@@ -780,8 +766,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
-
 
   // extract grid index
   int igrid = ii + chunk_offset;
@@ -840,7 +824,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
     const auto idxb = icoeff % idxb_max;
     const auto idx_chem = icoeff / idxb_max;
-    d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
+    d_gridall(igrid,icoeff+3) = snaKK.blist(ii,idx_chem,idxb);
   }
 
 }
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h
index 9073b921c1..2f2ae59426 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@@ -225,7 +225,7 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
 
   Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
   Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
-  //Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
   Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
   Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
   Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
@@ -271,6 +271,8 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
   double h0, h1, h2, h3, h4, h5;
   double lo0, lo1, lo2;
 
+  // Grant SNAKokkos access to this class's non-public members; snaKK is constructed from *this
+  friend class SNAKokkos<DeviceType, real_type, vector_length>;
 };
 
 // These wrapper classes exist to make the compute style factory happy/avoid having
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
index 8f6958904b..1a40af4e8c 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -121,13 +121,9 @@ ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridL
   }
   Kokkos::deep_copy(d_test,h_test);
 
-  double bytes =  MemKK::memory_usage(d_wjelem);
-
-  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
-    rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
   snaKK.grow_rij(0,0);
   snaKK.init();
-
 }
 
 // Destructor
@@ -386,8 +382,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   //       routines and avoid having to loop over all atoms (which limits us to
   //       natoms = max team size).
 
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
-
   // basic quantities associated with this team:
   // team_rank : rank of thread in this team
   // league_rank : rank of team in this league
@@ -405,10 +399,10 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   // This is used to cache whether or not an atom is within the cutoff.
   // If it is, type_cache is assigned to the atom type.
   // If it's not, it's assigned to -1.
-  const int tile_size = ntotal; //max_neighs; // number of elements per thread
-  const int team_rank = team.team_rank();
-  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
-  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+  //const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  //const int team_rank = team.team_rank();
+  //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
 
   // convert to grid indices
 
@@ -475,7 +469,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   const int itype = 1;
   int ielem = 0;
   if (chemflag) ielem = d_map[itype];
-  const double radi = d_radelem[ielem];
+  //const double radi = d_radelem[ielem];
 
   // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
   // The purpose here is to transform for triclinic boxes.
@@ -503,7 +497,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
 
     if (jtype >= 0)
       ninside++;
-
   }
 
   /*
@@ -544,22 +537,22 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
     if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
       int jelem = 0;
       if (chemflag) jelem = d_map[jtype];
-      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
-      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
-      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
       // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
       // actually since the views here have values starting at 0, let's use jelem
-      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-      my_sna.inside(ii,offset) = j;
+      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,offset) = j;
       if (switchinnerflag) {
-        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
       }
       if (chemflag)
-        my_sna.element(ii,offset) = jelem;
+        snaKK.element(ii,offset) = jelem;
       else
-        my_sna.element(ii,offset) = 0;
+        snaKK.element(ii,offset) = 0;
       offset++;
     }
   }
@@ -576,22 +569,22 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
       int jtype = type(j);
       int jelem = 0;
       if (chemflag) jelem = d_map[jtype];
-      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
-      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
-      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
       // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
       // actually since the views here have values starting at 0, let's use jelem
-      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-      my_sna.inside(ii,offset) = j;
+      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      snaKK.inside(ii,offset) = j;
       if (switchinnerflag) {
-        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
       }
       if (chemflag)
-        my_sna.element(ii,offset) = jelem;
+        snaKK.element(ii,offset) = jelem;
       else
-        my_sna.element(ii,offset) = 0;
+        snaKK.element(ii,offset) = 0;
       offset++;
     }
   }
@@ -611,22 +604,22 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
         int jtype = type(j);
         int jelem = 0;
         if (chemflag) jelem = d_map[jtype];
-        my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
-        my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
-        my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+        snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
+        snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
+        snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
         // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
         // actually since the views here have values starting at 0, let's use jelem
-        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-        my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-        my_sna.inside(ii,offset) = j;
+        snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+        snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+        snaKK.inside(ii,offset) = j;
         if (switchinnerflag) {
-          my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-          my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+          snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+          snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
         }
         if (chemflag)
-          my_sna.element(ii,offset) = jelem;
+          snaKK.element(ii,offset) = jelem;
         else
-          my_sna.element(ii,offset) = 0;
+          snaKK.element(ii,offset) = 0;
       }
       offset++;
     }
@@ -638,7 +631,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int ii = iatom_mod + iatom_div * vector_length;
   if (ii >= chunk_size) return;
@@ -646,28 +638,26 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   const int ninside = d_ninside(ii);
   if (jnbor >= ninside) return;
 
-  my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
+  snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int iatom_mod, const int j, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int ii = iatom_mod + iatom_div * vector_length;
   if (ii >= chunk_size) return;
 
-  int itype = type(ii);
+  //int itype = type(ii);
   // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp`
   int ielem = 0;
 
-  my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
+  snaKK.pre_ui(iatom_mod, j, ielem, iatom_div);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiSmall>::member_type& team) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   // extract flattened atom_div / neighbor number / bend_location
   int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
@@ -686,7 +676,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
     const int ninside = d_ninside(ii);
     if (jj >= ninside) return;
 
-    my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
   });
 
 }
@@ -694,7 +684,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiLarge>::member_type& team) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   // extract flattened atom_div / neighbor number / bend location
   int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
@@ -711,28 +700,27 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
     const int ninside = d_ninside(ii);
     if (jj >= ninside) return;
 
-    my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
+    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
   });
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
 
-  if (idxu > my_sna.idxu_max) return;
+  if (idxu > snaKK.idxu_max) return;
 
   int elem_count = chemflag ? nelements : 1;
 
   for (int ielem = 0; ielem < elem_count; ielem++){
 
-    const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
+    const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
 
-    auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
-    auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
 
     if (mapper.flip_sign == 1){
       utot_im = -utot_im;
@@ -740,11 +728,11 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
       utot_re = -utot_re;
     }
 
-    my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
+    snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
 
     if (mapper.flip_sign == 0) {
-      my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
-      my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+      snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+      snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
     }
   }
 }
@@ -752,46 +740,43 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
 
-  if (jjz >= my_sna.idxz_max) return;
+  if (jjz >= snaKK.idxz_max) return;
 
-  my_sna.compute_zi(iatom_mod,jjz,iatom_div);
+  snaKK.compute_zi(iatom_mod,jjz,iatom_div);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
 
-  if (jjb >= my_sna.idxb_max) return;
+  if (jjb >= snaKK.idxb_max) return;
 
-  my_sna.compute_bi(iatom_mod,jjb,iatom_div);
+  snaKK.compute_bi(iatom_mod,jjb,iatom_div);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
 
-  if (idxb >= my_sna.idxb_max) return;
+  if (idxb >= snaKK.idxb_max) return;
 
-  const int ntriples = my_sna.ntriples;
+  const int ntriples = snaKK.ntriples;
 
   for (int itriple = 0; itriple < ntriples; itriple++) {
 
-    const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
+    const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div);
 
-    my_sna.blist(iatom, itriple, idxb) = blocal;
+    snaKK.blist(iatom, itriple, idxb) = blocal;
   }
 
 }
@@ -799,8 +784,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocal2Fill, const int& ii) const {
-  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
-
 
   // extract grid index
   int igrid = ii + chunk_offset;
@@ -859,7 +842,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
     const auto idxb = icoeff % idxb_max;
     const auto idx_chem = icoeff / idxb_max;
-    d_alocal(igrid,icoeff+6) = my_sna.blist(ii,idx_chem,idxb);
+    d_alocal(igrid,icoeff+6) = snaKK.blist(ii,idx_chem,idxb);
   }
 
 }
diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h
index 660503eed8..4dc4029d12 100644
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@@ -375,7 +375,6 @@ class PairSNAPKokkos : public PairSNAP {
 
   // Make SNAKokkos a friend
   friend class SNAKokkos<DeviceType, real_type, vector_length>;
-
 };
 
 
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index 2b9b862645..783043e6d9 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -536,8 +536,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::coeff(int narg, char
   Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
   Kokkos::deep_copy(d_map,h_map);
 
-  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this); //rfac0,twojmax,
-    //rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(*this);
   snaKK.grow_rij(0,0);
   snaKK.init();
 }
diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index 5ba5c159ac..61aebaf97d 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -172,9 +172,9 @@ class SNAKokkos {
   KOKKOS_INLINE_FUNCTION
   SNAKokkos(const SNAKokkos<DeviceType,real_type,vector_length>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
 
+  template<class CopyClass>
   inline
-  //SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int);
-  SNAKokkos(const PairSNAPKokkos<DeviceType, real_type, vector_length>&);
+  SNAKokkos(const CopyClass&);
 
   KOKKOS_INLINE_FUNCTION
   ~SNAKokkos();
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 1ea971d146..622ef0b8ae 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -29,17 +29,18 @@ static const double MY_PI  = 3.14159265358979323846; // pi
 static const double MY_PI2  = 1.57079632679489661923; // pi/2
 
 template<class DeviceType, typename real_type, int vector_length>
+template<class CopyClass>
 inline
-SNAKokkos<DeviceType, real_type, vector_length>::SNAKokkos(const PairSNAPKokkos<DeviceType, real_type, vector_length>& psk)
-  : rfac0(psk.rfac0), rmin0(psk.rmin0), switch_flag(psk.switchflag),
-    bzero_flag(psk.bzeroflag), chem_flag(psk.chemflag), bnorm_flag(psk.bnormflag),
-    wselfall_flag(psk.wselfallflag), switch_inner_flag(psk.switchinnerflag),
-    quadratic_flag(psk.quadraticflag), twojmax(psk.twojmax), d_coeffelem(psk.d_coeffelem)
+SNAKokkos<DeviceType, real_type, vector_length>::SNAKokkos(const CopyClass& copy)
+  : twojmax(copy.twojmax), d_coeffelem(copy.d_coeffelem), rmin0(copy.rmin0),
+    rfac0(copy.rfac0), switch_flag(copy.switchflag), switch_inner_flag(copy.switchinnerflag),
+    chem_flag(copy.chemflag), bnorm_flag(copy.bnormflag), wselfall_flag(copy.wselfallflag),
+    quadratic_flag(copy.quadraticflag), bzero_flag(copy.bzeroflag)
 {
   wself = static_cast<real_type>(1.0);
 
   if (chem_flag)
-    nelements = psk.nelements;
+    nelements = copy.nelements;
   else
     nelements = 1;
 

From 008bf146938fa1992084556e85141222eaa81983 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 14 Jan 2025 15:49:40 -0700
Subject: [PATCH 47/51] Fix compile issues from #4391

---
 src/KOKKOS/compute_sna_grid_kokkos.h          |  71 ++++--
 src/KOKKOS/compute_sna_grid_kokkos_impl.h     | 217 ++++++++++--------
 src/KOKKOS/compute_sna_grid_local_kokkos.h    |  69 ++++--
 .../compute_sna_grid_local_kokkos_impl.h      | 214 +++++++++--------
 src/KOKKOS/pair_snap_kokkos_impl.h            |  22 +-
 5 files changed, 355 insertions(+), 238 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index a65ff44546..ac378b07df 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -53,7 +53,6 @@ struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero yl
 struct TagPairSNAPComputeZi{};
 struct TagPairSNAPBeta{};
 struct TagPairSNAPComputeBi{};
-struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
 struct TagPairSNAPComputeYi{};
 struct TagPairSNAPComputeYiWithZlist{};
 template<int dir>
@@ -68,9 +67,8 @@ struct TagCSNAGridPreUi{};
 struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
 struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
 struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
-struct TagCSNAGridComputeZi{};
-struct TagCSNAGridComputeBi{};
-struct TagCSNAGridTransformBi{}; // re-order blist from AoSoA to AoS
+template <bool chemsnap> struct TagCSNAGridComputeZi{};
+template <bool chemsnap> struct TagCSNAGridComputeBi{};
 struct TagCSNAGridLocalFill{}; // fill the gridlocal array
 //struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
 
@@ -114,9 +112,10 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   static constexpr int team_size_compute_ui = 2;
   static constexpr int tile_size_transform_ui = 2;
   static constexpr int tile_size_compute_zi = 2;
+  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
   static constexpr int tile_size_compute_bi = 2;
-  static constexpr int tile_size_transform_bi = 2;
   static constexpr int tile_size_compute_yi = 2;
+  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
   static constexpr int team_size_compute_fused_deidrj = 2;
 #else
   static constexpr int team_size_compute_neigh = 4;
@@ -126,33 +125,44 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   static constexpr int tile_size_transform_ui = 4;
   static constexpr int tile_size_compute_zi = 8;
   static constexpr int tile_size_compute_bi = 4;
-  static constexpr int tile_size_transform_bi = 4;
   static constexpr int tile_size_compute_yi = 8;
   static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+
+  // this empirically reduces perf fluctuations from compiler version to compiler version
+  static constexpr int min_blocks_compute_zi = 4;
+  static constexpr int min_blocks_compute_yi = 4;
 #endif
 
   // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
   // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
   // and reduces the verbosity of the LaunchBound by hiding the explicit
   // multiplication by vector_length
-  template <class Device, int num_tiles, class TagComputeSNAP>
-  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagComputeSNAP>;
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
 
   // MDRangePolicy for the 3D grid loop:
-  template <class Device, class TagComputeSNAP>
+  template <class Device, class TagComputeSNA>
   using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
 
   // Testing out team policies
-  template <class Device, int num_teams,  class TagComputeSNAP>
-  using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
-  //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNAP>;
+  template <class Device, int num_teams,  class TagComputeSNA>
+  using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+  //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNA>;
   //using team_member = typename team_policy::member_type;
 
   // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
   // This hides the LaunchBounds abstraction by hiding the explicit
   // multiplication by vector length
-  template <class Device, int num_teams, class TagComputeSNAP>
-  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
+  template <class Device, int num_teams, class TagComputeSNA>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Helper routine that builds a Snap3DRangePolicy spanning {vector_length, second_loop, chunk_size_div}
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
+    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
+                                                                 {vector_length, second_loop, chunk_size_div},
+                                                                 {vector_length, num_tiles, 1});
+  }
 
   ComputeSNAGridKokkos(class LAMMPS *, int, char **);
   ~ComputeSNAGridKokkos() override;
@@ -193,7 +203,13 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const;
+  void operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi, const int& iatom) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
@@ -202,16 +218,31 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
+  void operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
+  void operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+  void operator() (TagCSNAGridTransformUi, const int& iatom) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagCSNAGridLocalFill,const int& ii) const;
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 8275e810a3..ec69b8bbdc 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -222,7 +222,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
-  if (triclinic){
+  if (triclinic) {
     /*
     xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
     xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
@@ -266,10 +266,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
 
     //PreUi
     {
-      // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
-      Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
-        policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
-      Kokkos::parallel_for("PreUi",policy_preui,*this);
+      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>(chunk_size_div, twojmax + 1);
+      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
     }
 
     // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
@@ -292,7 +290,7 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
         policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
         Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
       } else {
-        // Version w/out parallelism  over j_bend
+        // Version w/out parallelism over j_bend
 
         // total number of teams needed: (natoms / 32) * (ntotal)
         const int n_teams = chunk_size_div * max_neighs;
@@ -307,33 +305,29 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
 
     //TransformUi: un-"fold" ulisttot, zero ylist
     {
-      // team_size_transform_ui is defined in `pair_snap_kokkos.h`
-      Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
-          policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
-      Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
+      // Expand ulisttot_re,_im -> ulisttot
+      // Zero out ylist
+      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>(chunk_size_div, snaKK.idxu_max);
+      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
     }
 
-    //Compute bispectrum in AoSoA data layout, transform Bi
+    //Compute bispectrum
+    // tile_size_[compute_zi, compute_bi] and min_blocks_compute_zi are defined in `compute_sna_grid_kokkos.h`
 
-    //ComputeZi
-    const int idxz_max = snaKK.idxz_max;
-    Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
-        policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
-    Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
+    //ComputeZi and Bi
+    if (nelements > 1) {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
 
-    //ComputeBi
-    const int idxb_max = snaKK.idxb_max;
-    Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
-        policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
-    Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
+    } else {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
 
-    //Transform data layout of blist out of AoSoA
-    //We need this because `blist` gets used in ComputeForce which doesn't
-    //take advantage of AoSoA, which at best would only be beneficial on the margins
-    //NOTE: Do we need this in compute sna/grid/kk?
-    Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridTransformBi>
-        policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
-    Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
+    }
 
     // Fill the grid array with bispectrum values
     {
@@ -346,6 +340,8 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
 
   } // end while
 
+  copymode = 0;
+
   k_gridlocal.template modify<DeviceType>();
   k_gridlocal.template sync<LMPHostType>();
 
@@ -478,7 +474,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
     if (jtype >= 0)
       ninside++;
-
   }
 
   /*
@@ -609,39 +604,68 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   */
 }
 
+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines
+------------------------------------------------------------------------- */
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
 
-  const int ii = iatom_mod + iatom_div * vector_length;
-  if (ii >= chunk_size) return;
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
 
-  const int ninside = d_ninside(ii);
+  const int ninside = d_ninside(iatom);
   if (jnbor >= ninside) return;
 
-  snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
+  snaKK.compute_cayley_klein(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const {
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const {
+  if (iatom >= chunk_size) return;
 
-  const int ii = iatom_mod + iatom_div * vector_length;
-  if (ii >= chunk_size) return;
+  int itype = type(iatom);
+  int ielem = d_map[itype];
 
-  //int itype = type(ii);
-  // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp`
-  int ielem = 0;
+  snaKK.pre_ui(iatom, j, ielem);
+}
 
-  snaKK.pre_ui(iatom_mod, j, ielem, iatom_div);
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+
+  const int itype = type(iatom);
+  const int ielem = d_map[itype];
+
+  for (int j = 0; j <= twojmax; j++)
+    snaKK.pre_ui(iatom, j, ielem);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
 
-  // extract flattened atom_div / neighbor number / bend_location
+  // extract flattened atom_div / neighbor number / bend location
   int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
 
   // extract neighbor index, iatom_div
@@ -686,81 +710,90 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   });
 }
 
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
-
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
-
-  if (idxu > snaKK.idxu_max) return;
-
-  int elem_count = chemflag ? nelements : 1;
-
-  for (int ielem = 0; ielem < elem_count; ielem++){
-
-    const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
-
-    auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
-    auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
-
-    if (mapper.flip_sign == 1){
-      utot_im = -utot_im;
-    } else if (mapper.flip_sign == -1){
-      utot_re = -utot_re;
-    }
-
-    snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
-
-    if (mapper.flip_sign == 0) {
-      snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
-      snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
-    }
-  }
+  if (idxu >= snaKK.idxu_max) return;
+  snaKK.transform_ui(iatom, idxu);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const {
+  if (iatom >= chunk_size) return;
+  snaKK.transform_ui(iatom, idxu);
+}
 
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int idxu = 0; idxu < snaKK.idxu_max; idxu++)
+    snaKK.transform_ui(iatom, idxu);
+}
+
+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+  view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
-
   if (jjz >= snaKK.idxz_max) return;
-
-  snaKK.compute_zi(iatom_mod,jjz,iatom_div);
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
 
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjz = 0; jjz < snaKK.idxz_max; jjz++)
+    snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
-
   if (jjb >= snaKK.idxb_max) return;
-
-  snaKK.compute_bi(iatom_mod,jjb,iatom_div);
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
   if (iatom >= chunk_size) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
 
-  if (idxb >= snaKK.idxb_max) return;
-
-  const int ntriples = snaKK.ntriples;
-
-  for (int itriple = 0; itriple < ntriples; itriple++) {
-
-    const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div);
-
-    snaKK.blist(iatom, itriple, idxb) = blocal;
-  }
-
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjb = 0; jjb < snaKK.idxb_max; jjb++)
+    snaKK.template compute_bi<chemsnap>(iatom, jjb);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h
index 2f2ae59426..735e1b03d0 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@@ -53,7 +53,6 @@ struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero yl
 struct TagPairSNAPComputeZi{};
 struct TagPairSNAPBeta{};
 struct TagPairSNAPComputeBi{};
-struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
 struct TagPairSNAPComputeYi{};
 struct TagPairSNAPComputeYiWithZlist{};
 template<int dir>
@@ -68,9 +67,8 @@ struct TagCSNAGridLocalPreUi{};
 struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence
 struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence
 struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
-struct TagCSNAGridLocalComputeZi{};
-struct TagCSNAGridLocalComputeBi{};
-struct TagCSNAGridLocalTransformBi{}; // re-order blist from AoSoA to AoS
+template <bool chemsnap> struct TagCSNAGridLocalComputeZi{};
+template <bool chemsnap> struct TagCSNAGridLocalComputeBi{};
 struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array
 //struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
 
@@ -113,9 +111,10 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
   static constexpr int team_size_compute_ui = 2;
   static constexpr int tile_size_transform_ui = 2;
   static constexpr int tile_size_compute_zi = 2;
+  static constexpr int min_blocks_compute_zi = 0; // no minimum bound
   static constexpr int tile_size_compute_bi = 2;
-  static constexpr int tile_size_transform_bi = 2;
   static constexpr int tile_size_compute_yi = 2;
+  static constexpr int min_blocks_compute_yi = 0; // no minimum bound
   static constexpr int team_size_compute_fused_deidrj = 2;
 #else
   static constexpr int team_size_compute_neigh = 4;
@@ -125,31 +124,42 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
   static constexpr int tile_size_transform_ui = 4;
   static constexpr int tile_size_compute_zi = 8;
   static constexpr int tile_size_compute_bi = 4;
-  static constexpr int tile_size_transform_bi = 4;
   static constexpr int tile_size_compute_yi = 8;
   static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+
+  // this empirically reduces perf fluctuations from compiler version to compiler version
+  static constexpr int min_blocks_compute_zi = 4;
+  static constexpr int min_blocks_compute_yi = 4;
 #endif
 
   // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
   // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
   // and reduces the verbosity of the LaunchBound by hiding the explicit
   // multiplication by vector_length
-  template <class Device, int num_tiles, class TagComputeSNAP>
-  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagComputeSNAP>;
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles, min_blocks>, TagComputeSNA>;
 
   // MDRangePolicy for the 3D grid loop:
-  template <class Device, class TagComputeSNAP>
+  template <class Device, class TagComputeSNA>
   using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
 
   // Testing out team policies
-  template <class Device, int num_teams,  class TagComputeSNAP>
-  using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
+  template <class Device, int num_teams,  class TagComputeSNA>
+  using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
 
   // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
   // This hides the LaunchBounds abstraction by hiding the explicit
   // multiplication by vector length
-  template <class Device, int num_teams, class TagComputeSNAP>
-  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
+  template <class Device, int num_teams, class TagComputeSNA>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNA>;
+
+  // Helper routine that builds the tiled Snap3DRangePolicy for a kernel launch
+  template <class Device, int num_tiles, class TagComputeSNA, int min_blocks = 0>
+  auto snap_get_policy(const int& chunk_size_div, const int& second_loop) {
+    return Snap3DRangePolicy<Device, num_tiles, TagComputeSNA, min_blocks>({0, 0, 0},
+                                                                 {vector_length, second_loop, chunk_size_div},
+                                                                 {vector_length, num_tiles, 1});
+  }
 
   ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **);
   ~ComputeSNAGridLocalKokkos() override;
@@ -186,7 +196,13 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
   void operator() (TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridLocalPreUi,const int iatom_mod, const int j, const int iatom_div) const;
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalPreUi, const int& iatom) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiSmall>::member_type& team) const;
@@ -195,16 +211,31 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
   void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeUiLarge>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+  void operator() (TagCSNAGridLocalTransformUi, const int& iatom) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& idxz, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& idxz) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& idxb, const int& iatom_div) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& idxb) const;
+
+  template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagCSNAGridLocal2Fill,const int& ii) const;
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
index 1a40af4e8c..1c3fed3a0c 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -232,7 +232,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
-  if (triclinic){
+  if (triclinic) {
     /*
     xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
     xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
@@ -276,10 +276,8 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
 
     //PreUi
     {
-      // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
-      Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridLocalPreUi>
-        policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
-      Kokkos::parallel_for("PreUi",policy_preui,*this);
+      auto policy_pre_ui = snap_get_policy<DeviceType, tile_size_pre_ui, TagCSNAGridLocalPreUi>(chunk_size_div, twojmax + 1);
+      Kokkos::parallel_for("PreUi", policy_pre_ui, *this);
     }
 
     // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
@@ -302,7 +300,7 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
         policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
         Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
       } else {
-        // Version w/out parallelism  over j_bend
+        // Version w/out parallelism over j_bend
 
         // total number of teams needed: (natoms / 32) * (ntotal)
         const int n_teams = chunk_size_div * max_neighs;
@@ -317,33 +315,29 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
 
     //TransformUi: un-"fold" ulisttot, zero ylist
     {
-      // team_size_transform_ui is defined in `pair_snap_kokkos.h`
-      Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridLocalTransformUi>
-          policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
-      Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
+      // Expand ulisttot_re,_im -> ulisttot
+      // Zero out ylist
+      auto policy_transform_ui = snap_get_policy<DeviceType, tile_size_transform_ui, TagCSNAGridLocalTransformUi>(chunk_size_div, snaKK.idxu_max);
+      Kokkos::parallel_for("TransformUi", policy_transform_ui, *this);
     }
 
-    //Compute bispectrum in AoSoA data layout, transform Bi
+    //Compute bispectrum
+    // tile_size_compute_zi and tile_size_compute_bi are defined in `compute_sna_grid_local_kokkos.h`
 
-    //ComputeZi
-    const int idxz_max = snaKK.idxz_max;
-    Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi>
-        policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
-    Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
+    //ComputeZi and Bi
+    if (nelements > 1) {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<true>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this);
 
-    //ComputeBi
-    const int idxb_max = snaKK.idxb_max;
-    Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi>
-        policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
-    Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<true>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this);
+    } else {
+      auto policy_compute_zi = snap_get_policy<DeviceType, tile_size_compute_zi, TagCSNAGridLocalComputeZi<false>, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max);
+      Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);
 
-    //Transform data layout of blist out of AoSoA
-    //We need this because `blist` gets used in ComputeForce which doesn't
-    //take advantage of AoSoA, which at best would only be beneficial on the margins
-    //NOTE: Do we need this in compute sna/grid/kk?
-    Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridLocalTransformBi>
-        policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
-    Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+      auto policy_compute_bi = snap_get_policy<DeviceType, tile_size_compute_bi, TagCSNAGridLocalComputeBi<false>>(chunk_size_div, snaKK.idxb_max);
+      Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this);
+    }
 
     // Fill the grid array with bispectrum values
     {
@@ -627,39 +621,68 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   */
 }
 
+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines
+------------------------------------------------------------------------- */
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
 
-  const int ii = iatom_mod + iatom_div * vector_length;
-  if (ii >= chunk_size) return;
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
 
-  const int ninside = d_ninside(ii);
+  const int ninside = d_ninside(iatom);
   if (jnbor >= ninside) return;
 
-  snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
+  snaKK.compute_cayley_klein(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  int itype = type(iatom);
+  int ielem = d_map[itype];
+
+  snaKK.pre_ui(iatom, j, ielem);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int iatom_mod, const int j, const int iatom_div) const {
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const {
+  if (iatom >= chunk_size) return;
 
-  const int ii = iatom_mod + iatom_div * vector_length;
-  if (ii >= chunk_size) return;
+  int itype = type(iatom);
+  int ielem = d_map[itype];
 
-  //int itype = type(ii);
-  // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp`
-  int ielem = 0;
+  snaKK.pre_ui(iatom, j, ielem);
+}
 
-  snaKK.pre_ui(iatom_mod, j, ielem, iatom_div);
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalPreUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+
+  const int itype = type(iatom);
+  const int ielem = d_map[itype];
+
+  for (int j = 0; j <= twojmax; j++)
+    snaKK.pre_ui(iatom, j, ielem);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridLocalComputeUiSmall>::member_type& team) const {
 
-  // extract flattened atom_div / neighbor number / bend_location
+  // extract flattened atom_div / neighbor number / bend location
   int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
 
   // extract neighbor index, iatom_div
@@ -704,81 +727,90 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   });
 }
 
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
-
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const {
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
-
-  if (idxu > snaKK.idxu_max) return;
-
-  int elem_count = chemflag ? nelements : 1;
-
-  for (int ielem = 0; ielem < elem_count; ielem++){
-
-    const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
-
-    auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
-    auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
-
-    if (mapper.flip_sign == 1){
-      utot_im = -utot_im;
-    } else if (mapper.flip_sign == -1){
-      utot_re = -utot_re;
-    }
-
-    snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
-
-    if (mapper.flip_sign == 0) {
-      snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
-      snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
-    }
-  }
+  if (idxu >= snaKK.idxu_max) return;
+  snaKK.transform_ui(iatom, idxu);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const {
+  if (iatom >= chunk_size) return;
+  snaKK.transform_ui(iatom, idxu);
+}
 
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformUi, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int idxu = 0; idxu < snaKK.idxu_max; idxu++)
+    snaKK.transform_ui(iatom, idxu);
+}
+
+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+   view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom_mod, const int& jjz, const int& iatom_div) const {
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
-
   if (jjz >= snaKK.idxz_max) return;
-
-  snaKK.compute_zi(iatom_mod,jjz,iatom_div);
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom, const int& jjz) const {
+  if (iatom >= chunk_size) return;
+  snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
 
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeZi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjz = 0; jjz < snaKK.idxz_max; jjz++)
+    snaKK.template compute_zi<chemsnap>(iatom, jjz);
+}
+
+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" view
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom_mod, const int& jjb, const int& iatom_div) const {
   const int iatom = iatom_mod + iatom_div * vector_length;
   if (iatom >= chunk_size) return;
-
   if (jjb >= snaKK.idxb_max) return;
-
-  snaKK.compute_bi(iatom_mod,jjb,iatom_div);
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom, const int& jjb) const {
   if (iatom >= chunk_size) return;
+  snaKK.template compute_bi<chemsnap>(iatom, jjb);
+}
 
-  if (idxb >= snaKK.idxb_max) return;
-
-  const int ntriples = snaKK.ntriples;
-
-  for (int itriple = 0; itriple < ntriples; itriple++) {
-
-    const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div);
-
-    snaKK.blist(iatom, itriple, idxb) = blocal;
-  }
-
+template<class DeviceType, typename real_type, int vector_length>
+template <bool chemsnap> KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalComputeBi<chemsnap>, const int& iatom) const {
+  if (iatom >= chunk_size) return;
+  for (int jjb = 0; jjb < snaKK.idxb_max; jjb++)
+    snaKK.template compute_bi<chemsnap>(iatom, jjb);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index 783043e6d9..17ce8e1c9d 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -3,12 +3,10 @@
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
    LAMMPS development team: developers@lammps.org
-
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
@@ -39,17 +37,6 @@
 
 namespace LAMMPS_NS {
 
-// Outstanding issues with quadratic term
-// 1. there seems to a problem with compute_optimized energy calc
-// it does not match compute_regular, even when quadratic coeffs = 0
-
-//static double t1 = 0.0;
-//static double t2 = 0.0;
-//static double t3 = 0.0;
-//static double t4 = 0.0;
-//static double t5 = 0.0;
-//static double t6 = 0.0;
-//static double t7 = 0.0;
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType, typename real_type, int vector_length>
@@ -219,7 +206,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
       // team_size_compute_neigh is defined in `pair_snap_kokkos.h`
       int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs);
 
-      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagPairSNAPComputeNeigh>
+        policy_neigh(chunk_size,team_size_compute_neigh,vector_length);
       policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
       Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
     }
@@ -259,7 +247,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
         const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
         const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
 
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
         policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
         Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this);
       } else {
@@ -269,7 +258,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
         const int n_teams = chunk_size_div * max_neighs;
         const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
 
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
         policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
         Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this);
       }

From eb5977dc66881f63d0c6a200c8321845e261094a Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 14 Jan 2025 15:57:15 -0700
Subject: [PATCH 48/51] Fix issues with host_flag

---
 src/KOKKOS/compute_sna_grid_kokkos.h          |  3 ---
 src/KOKKOS/compute_sna_grid_kokkos_impl.h     | 26 +------------------
 src/KOKKOS/compute_sna_grid_local_kokkos.h    |  3 ---
 .../compute_sna_grid_local_kokkos_impl.h      | 25 +-----------------
 4 files changed, 2 insertions(+), 55 deletions(-)

diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index ac378b07df..5a81309a4e 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -167,7 +167,6 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   ComputeSNAGridKokkos(class LAMMPS *, int, char **);
   ~ComputeSNAGridKokkos() override;
 
-  void init() override;
   void setup() override;
   void compute_array() override;
 
@@ -321,7 +320,6 @@ class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_
 
   ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
 
-  void init() override;
   void compute_array() override;
 
 };
@@ -337,7 +335,6 @@ class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KO
 
   ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
 
-  void init() override;
   void compute_array() override;
 
 };
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index ec69b8bbdc..432dbe9f98 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -141,24 +141,11 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokko
   //memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
 }
 
-// Init
-
-template<class DeviceType, typename real_type, int vector_length>
-void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
-{
-  if (host_flag) {
-    return;
-  }
-  ComputeSNAGrid::init();
-
-}
-
 // Setup
 
 template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
 {
-
   // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
   // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
 
@@ -184,6 +171,7 @@ template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
 {
   if (host_flag) {
+    ComputeSNAGrid::compute_array();
     return;
   }
 
@@ -907,12 +895,6 @@ template<class DeviceType>
 ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
    : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
 
-template<class DeviceType>
-void ComputeSNAGridKokkosDevice<DeviceType>::init()
-{
-  Base::init();
-}
-
 template<class DeviceType>
 void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
 {
@@ -924,12 +906,6 @@ template<class DeviceType>
 ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
    : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
 
-template<class DeviceType>
-void ComputeSNAGridKokkosHost<DeviceType>::init()
-{
-  Base::init();
-}
-
 template<class DeviceType>
 void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
 {
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h
index 735e1b03d0..754d4e36af 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@@ -164,7 +164,6 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
   ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **);
   ~ComputeSNAGridLocalKokkos() override;
 
-  void init() override;
   void setup() override;
   void compute_local() override;
 
@@ -320,7 +319,6 @@ class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos<DeviceT
 
   ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **);
 
-  void init() override;
   void compute_local() override;
   //void setup() override;
 
@@ -337,7 +335,6 @@ class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos<DeviceTyp
 
   ComputeSNAGridLocalKokkosHost(class LAMMPS *, int, char **);
 
-  void init() override;
   void compute_local() override;
 
 };
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
index 1c3fed3a0c..434db594ef 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -142,18 +142,6 @@ ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGrid
   //memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
 }
 
-// Init
-
-template<class DeviceType, typename real_type, int vector_length>
-void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::init()
-{
-  if (host_flag) {
-    return;
-  }
-  ComputeSNAGridLocal::init();
-
-}
-
 // Setup
 
 template<class DeviceType, typename real_type, int vector_length>
@@ -191,6 +179,7 @@ template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_local()
 {
   if (host_flag) {
+    ComputeSNAGridLocal::compute_local(); // host path: run the base-class (non-Kokkos) implementation
     return;
   }
 
@@ -924,12 +913,6 @@ template<class DeviceType>
 ComputeSNAGridLocalKokkosDevice<DeviceType>::ComputeSNAGridLocalKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
    : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
 
-template<class DeviceType>
-void ComputeSNAGridLocalKokkosDevice<DeviceType>::init()
-{
-  Base::init();
-}
-
 template<class DeviceType>
 void ComputeSNAGridLocalKokkosDevice<DeviceType>::compute_local()
 {
@@ -941,12 +924,6 @@ template<class DeviceType>
 ComputeSNAGridLocalKokkosHost<DeviceType>::ComputeSNAGridLocalKokkosHost(class LAMMPS *lmp, int narg, char **arg)
    : ComputeSNAGridLocalKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
 
-template<class DeviceType>
-void ComputeSNAGridLocalKokkosHost<DeviceType>::init()
-{
-  Base::init();
-}
-
 template<class DeviceType>
 void ComputeSNAGridLocalKokkosHost<DeviceType>::compute_local()
 {

From 536aa7cadffd8ea998ea0b89e7c1cc569964c2fe Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Tue, 21 Jan 2025 17:09:04 +0100
Subject: [PATCH 49/51] Removed debug comments/old printfs etc.

---
 .../compute_gaussian_grid_local_kokkos.cpp    |  53 -------
 .../compute_gaussian_grid_local_kokkos.h      |  22 ---
 src/KOKKOS/compute_sna_grid_kokkos.cpp        |  56 -------
 src/KOKKOS/compute_sna_grid_kokkos.h          |  91 -----------
 src/KOKKOS/compute_sna_grid_kokkos_impl.h     | 130 ---------------
 src/KOKKOS/compute_sna_grid_local_kokkos.cpp  |  56 -------
 src/KOKKOS/compute_sna_grid_local_kokkos.h    |  58 -------
 .../compute_sna_grid_local_kokkos_impl.h      | 150 ------------------
 src/ML-SNAP/compute_gaussian_grid_local.cpp   |   4 -
 src/ML-SNAP/compute_grid.cpp                  |   3 -
 src/ML-SNAP/compute_grid_local.cpp            |   8 -
 11 files changed, 631 deletions(-)

diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
index 99380e0d63..cfd7e5a582 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp
@@ -54,17 +54,12 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
 
   host_flag = (execution_space == Host);
 
-  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
-
-  //cutsq_tmp = cutsq[1][1];
-
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = 1; j <= atom->ntypes; j++){
       k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j]; //cutsq_tmp;
       k_cutsq.template modify<LMPHostType>();
     }
   }
-  //printf(">>> 1\n");
   // Set up element lists
   int n = atom->ntypes;
   MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",n);
@@ -72,13 +67,11 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
   MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1);
   MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1);
   MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
-  //printf(">>> 2\n");
   auto h_radelem = Kokkos::create_mirror_view(d_radelem);
   auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem);
   auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem);
   auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem);
   auto h_map = Kokkos::create_mirror_view(d_map);
-  //printf(">>> 3\n");
   // start from index 1 because of how compute sna/grid is
   for (int i = 1; i <= atom->ntypes; i++) {
     h_radelem(i-1) = radelem[i];
@@ -86,21 +79,11 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
     h_prefacelem(i-1) = prefacelem[i];
     h_argfacelem(i-1) = argfacelem[i];
   }
-  //printf(">>> 4\n");
-  // In pair snap some things like `map` get allocated regardless of chem flag.
-  // In this compute, however, map does not get allocated in parent classes.
-  /*
-  for (int i = 1; i <= atom->ntypes; i++) {
-    h_map(i) = map[i];
-  }
-  */
-  //printf(">>> 5\n");
   Kokkos::deep_copy(d_radelem,h_radelem);
   Kokkos::deep_copy(d_sigmaelem,h_sigmaelem);
   Kokkos::deep_copy(d_prefacelem, h_prefacelem);
   Kokkos::deep_copy(d_argfacelem, h_argfacelem);
   Kokkos::deep_copy(d_map,h_map);
-  //printf(">>> 6\n");
 
 }
 
@@ -109,14 +92,12 @@ ComputeGaussianGridLocalKokkos<DeviceType>::ComputeGaussianGridLocalKokkos(LAMMP
 template<class DeviceType>
 ComputeGaussianGridLocalKokkos<DeviceType>::~ComputeGaussianGridLocalKokkos()
 {
-  //printf(">>> ComputeGaussianGridLocalKokkos destruct begin, copymode %d\n", copymode);
   if (copymode) return;
 
   memoryKK->destroy_kokkos(k_cutsq,cutsq);
   memoryKK->destroy_kokkos(k_alocal,alocal);
   //gridlocal_allocated = 0;
 
-  //printf(">>> ComputeGaussianGridLocalKokkos end\n");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -125,25 +106,12 @@ template<class DeviceType>
 void ComputeGaussianGridLocalKokkos<DeviceType>::setup()
 {
 
-  // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
-  // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
-
-  //ComputeGrid::set_grid_global();
-  //ComputeGrid::set_grid_local();
   ComputeGridLocal::setup();
 
   // allocate arrays
-  //printf(">>> rows cols kokkos init: %d %d\n", size_local_rows, size_local_cols);
   memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
-
-  //gridlocal_allocated = 1;
-  //array = gridall;
-
   array_local = alocal;
-
   d_alocal = k_alocal.template view<DeviceType>();
-  //d_grid = k_grid.template view<DeviceType>();
-  //d_gridall = k_gridall.template view<DeviceType>();
 
 }
 
@@ -160,8 +128,6 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::init()
 template<class DeviceType>
 void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
 {
-  //printf(">>> compute_local Kokkos begin\n");
-
   if (host_flag) {
     return;
   }
@@ -202,11 +168,6 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
     team_size_default = 1; // cost will increase with increasing team size //32;//max_neighs;
 
   if (triclinic){
-    /*
-    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-    */
     h0 = domain->h[0];
     h1 = domain->h[1];
     h2 = domain->h[2];
@@ -228,9 +189,7 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
       int vector_length = vector_length_default;
       int team_size = team_size_default;
       check_team_size_for<TagComputeGaussianGridLocalNeigh>(chunk_size,team_size,vector_length);
-      //printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length);
       typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh> policy_neigh(chunk_size,team_size,vector_length);
-      //printf(">>> Check 2\n");
       Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this);
     }
 
@@ -243,8 +202,6 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::compute_local()
   k_alocal.template modify<DeviceType>();
   k_alocal.template sync<LMPHostType>();
 
-  //printf(">>> k_alocal: %f\n", k_alocal.h_view(0,6));
-
 }
 
 /* ---------------------------------------------------------------------- */
@@ -254,7 +211,6 @@ KOKKOS_INLINE_FUNCTION
 void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const
 {
   const int ii = team.league_rank();
-  //printf("%d\n", ii);
 
   if (ii >= chunk_size) return;
 
@@ -284,7 +240,6 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianG
 
   // index ii already captures the proper grid point
   //int igrid = iz * (nx * ny) + iy * nx + ix;
-  //printf("%d %d\n", ii, igrid);
 
   // grid2x converts igrid to ix,iy,iz like we've done before
   // multiply grid integers by grid spacing delx, dely, delz
@@ -302,11 +257,6 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianG
     // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
 
     // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
-    /*
-    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-    */
     xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
     xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
     xgrid[2] = h2*xgrid[2] + lo2;
@@ -348,13 +298,10 @@ void ComputeGaussianGridLocalKokkos<DeviceType>::operator() (TagComputeGaussianG
     const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
 
     if (rsq < rnd_cutsq(jtype, jtype) ) {
-      //printf("%f %f\n", d_prefacelem(jtype-1), d_argfacelem(jtype-1));
       int icol = size_local_cols_base + jtype - 1;
       d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1));
     }
   }
-
-  //printf("%f\n", d_alocal(igrid, 6));
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
index deb5eaa8cb..34e12bc4af 100644
--- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h
@@ -58,8 +58,6 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
   void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy<DeviceType, TagComputeGaussianGridLocalNeigh>::member_type& team) const;
 
  private:
-  //double adof, mvv2e, mv2d, boltz;
-
   Kokkos::View<double*, DeviceType> d_radelem;              // element radii
   Kokkos::View<double*, DeviceType> d_sigmaelem;
   Kokkos::View<double*, DeviceType> d_prefacelem;
@@ -73,21 +71,6 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
       Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
   t_fparams_rnd rnd_cutsq;
 
-  /*
-  typename AT::t_x_array x;
-  typename AT::t_v_array v;
-  typename ArrayTypes<DeviceType>::t_float_1d rmass;
-  typename ArrayTypes<DeviceType>::t_float_1d mass;
-  typename ArrayTypes<DeviceType>::t_int_1d type;
-  typename ArrayTypes<DeviceType>::t_int_1d mask;
-  */
-
-  //typename AT::t_neighbors_2d d_neighbors;
-  //typename AT::t_int_1d d_ilist;
-  //typename AT::t_int_1d d_numneigh;
-
-  //DAT::tdual_float_2d k_result;
-  //typename AT::t_float_2d d_result;
 
   int max_neighs, inum, chunk_size, chunk_offset;
   int host_flag;
@@ -103,11 +86,6 @@ template <class DeviceType> class ComputeGaussianGridLocalKokkos : public Comput
   typename AT::t_float_2d d_alocal;
 
   // triclinic vars
-  /*
-  xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-  xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-  xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-  */
   double h0, h1, h2, h3, h4, h5;
   double lo0, lo1, lo2;
 };
diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp
index 8a05ba7901..197234cf1d 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@@ -23,59 +23,3 @@ template class ComputeSNAGridKokkosHost<LMPHostType>;
 #endif
 
 }
-
-
-
-
-// The following chunk will compile but we're gonna try a wrapper approach like pair snap.
-/*
-#include "compute_sna_grid_kokkos.h"
-
-#include "atom_kokkos.h"
-#include "atom_masks.h"
-#include "comm.h"
-#include "error.h"
-#include "memory_kokkos.h"
-#include "modify.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "neighbor_kokkos.h"
-#include "sna_kokkos.h"
-#include "update.h"
-
-using namespace LAMMPS_NS;
-
-// ----------------------------------------------------------------------
-
-template<class DeviceType>
-ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
-  ComputeSNAGrid(lmp, narg, arg)
-{
-
-  printf("^^^ inside ComputeSNAGridKokkos constructor\n");
-  kokkosable = 1;
-  atomKK = (AtomKokkos *) atom;
-  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = EMPTY_MASK;
-  datamask_modify = EMPTY_MASK;
-
-}
-
-// ----------------------------------------------------------------------
-
-template<class DeviceType>
-ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
-{
-  if (copymode) return;
-
-
-}
-
-namespace LAMMPS_NS {
-template class ComputeSNAGridKokkos<LMPDeviceType>;
-#ifdef LMP_KOKKOS_GPU
-template class ComputeSNAGridKokkos<LMPHostType>;
-#endif
-}
-*/
-
diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h
index 5a81309a4e..8a7d87acbb 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@@ -29,38 +29,13 @@ ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
 
 #include "compute_sna_grid.h"
 #include "kokkos_type.h"
-//#include "pair_snap.h"
-//#include "kokkos_type.h"
-//#include "neigh_list_kokkos.h"
 #include "sna_kokkos.h"
-//#include "pair_kokkos.h"
 
 namespace LAMMPS_NS {
 
 // Routines for both the CPU and GPU backend
-//template<int NEIGHFLAG, int EVFLAG>
-//struct TagPairSNAPComputeForce{};
-
 
 // GPU backend only
-/*
-struct TagPairSNAPComputeNeigh{};
-struct TagPairSNAPComputeCayleyKlein{};
-struct TagPairSNAPPreUi{};
-struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
-struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
-struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
-struct TagPairSNAPComputeZi{};
-struct TagPairSNAPBeta{};
-struct TagPairSNAPComputeBi{};
-struct TagPairSNAPComputeYi{};
-struct TagPairSNAPComputeYiWithZlist{};
-template<int dir>
-struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
-template<int dir>
-struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
-*/
-//struct TagPairSNAPPreUi{};
 struct TagCSNAGridComputeNeigh{};
 struct TagCSNAGridComputeCayleyKlein{};
 struct TagCSNAGridPreUi{};
@@ -70,26 +45,11 @@ struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero yl
 template <bool chemsnap> struct TagCSNAGridComputeZi{};
 template <bool chemsnap> struct TagCSNAGridComputeBi{};
 struct TagCSNAGridLocalFill{}; // fill the gridlocal array
-//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
 
 struct TagComputeSNAGridLoop{};
 struct TagComputeSNAGrid3D{};
-//struct TagCSNAGridTeam{};
 
 // CPU backend only
-/*
-struct TagPairSNAPComputeNeighCPU{};
-struct TagPairSNAPPreUiCPU{};
-struct TagPairSNAPComputeUiCPU{};
-struct TagPairSNAPTransformUiCPU{};
-struct TagPairSNAPComputeZiCPU{};
-struct TagPairSNAPBetaCPU{};
-struct TagPairSNAPComputeBiCPU{};
-struct TagPairSNAPZeroYiCPU{};
-struct TagPairSNAPComputeYiCPU{};
-struct TagPairSNAPComputeDuidrjCPU{};
-struct TagPairSNAPComputeDeidrjCPU{};
-*/
 struct TagComputeSNAGridLoopCPU{};
 
 //template<class DeviceType>
@@ -180,7 +140,6 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
 
   // operator function for example team policy
   //KOKKOS_INLINE_FUNCTION
-  //void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeam>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagComputeSNAGridLoop, const int& ) const;
@@ -191,9 +150,6 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   KOKKOS_INLINE_FUNCTION
   void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
 
-  // PrintNeigh
-  //void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPrintNeigh>::member_type& team) const;
-
   // 3D case - used by parallel_for
   KOKKOS_INLINE_FUNCTION
   void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
@@ -294,11 +250,6 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid {
   class DomainKokkos *domainKK;
 
   // triclinic vars
-  /*
-  xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-  xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-  xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-  */
   double h0, h1, h2, h3, h4, h5;
   double lo0, lo1, lo2;
 
@@ -344,45 +295,3 @@ class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KO
 
 #endif
 #endif
-
-// The following will compile with the chunk in cpp file but we're gonna try wrapper like pair snap.
-/*
-#ifdef COMPUTE_CLASS
-// clang-format off
-ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
-ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
-ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
-// clang-format on
-#else
-
-// clang-format off
-#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
-#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
-
-#include "compute_sna_grid.h"
-#include "kokkos_type.h"
-
-namespace LAMMPS_NS {
-
-//template<int CSTYLE, int NCOL>
-//struct TagComputeCoordAtom{};
-
-template<class DeviceType>
-class ComputeSNAGridKokkos : public ComputeSNAGrid {
- public:
-  typedef DeviceType device_type;
-  typedef ArrayTypes<DeviceType> AT;
-
-  ComputeSNAGridKokkos(class LAMMPS *, int, char **);
-  ~ComputeSNAGridKokkos() override;
-
- private:
-
-};
-
-}
-
-#endif
-#endif
-*/
-
diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
index 432dbe9f98..665a1b67e7 100644
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@@ -27,7 +27,6 @@
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "neighbor_kokkos.h"
-//#include "sna_kokkos.h"
 #include "domain.h"
 #include "domain_kokkos.h"
 #include "sna.h"
@@ -131,14 +130,10 @@ ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
 {
-  //printf(">>> ComputeSNAGridKokkos destruct begin copymode %d\n", copymode);
   if (copymode) return;
-  //printf(">>> After copymode\n");
 
   memoryKK->destroy_kokkos(k_cutsq,cutsq);
-  //memoryKK->destroy_kokkos(k_grid,grid);
   memoryKK->destroy_kokkos(k_gridall, gridall);
-  //memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
 }
 
 // Setup
@@ -161,7 +156,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
   array = gridall;
 
   d_gridlocal = k_gridlocal.template view<DeviceType>();
-  //d_grid = k_grid.template view<DeviceType>();
   d_gridall = k_gridall.template view<DeviceType>();
 }
 
@@ -199,23 +193,14 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
 
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
   // `total_range` is the number of grid points which may be larger than chunk size.
-  //printf(">>> total_range: %d\n", total_range);
   chunk_size = MIN(chunksize, total_range);
   chunk_offset = 0;
-  //snaKK.grow_rij(chunk_size, ntotal);
   snaKK.grow_rij(chunk_size, max_neighs);
 
-  //chunk_size = total_range;
-
   // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
   if (triclinic) {
-    /*
-    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-    */
     h0 = domain->h[0];
     h1 = domain->h[1];
     h2 = domain->h[2];
@@ -232,7 +217,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
     if (chunk_size > total_range - chunk_offset)
       chunk_size = total_range - chunk_offset;
 
-    //printf(">>> chunk_offset: %d\n", chunk_offset);
 
     //ComputeNeigh
     {
@@ -333,9 +317,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
   k_gridlocal.template modify<DeviceType>();
   k_gridlocal.template sync<LMPHostType>();
 
-  //k_grid.template modify<DeviceType>();
-  //k_grid.template sync<LMPHostType>();
-
   k_gridall.template modify<DeviceType>();
   k_gridall.template sync<LMPHostType>();
 }
@@ -396,7 +377,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
 
   // index ii already captures the proper grid point
   //int igrid = iz * (nx * ny) + iy * nx + ix;
-  //printf("%d %d\n", ii, igrid);
 
   // grid2x converts igrid to ix,iy,iz like we've done before
   // multiply grid integers by grid spacing delx, dely, delz
@@ -414,11 +394,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
 
     // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
-    /*
-    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-    */
     xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
     xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
     xgrid[2] = h2*xgrid[2] + lo2;
@@ -436,14 +411,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
   if (chemflag) ielem = d_map[itype];
   //const double radi = d_radelem[ielem];
 
-  // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
-  // The purpose here is to transform for triclinic boxes.
-  /*
-  if (triclinic){
-    printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
-  }
-  */
-
   // Compute the number of neighbors, store rsq
   int ninside = 0;
 
@@ -464,29 +431,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
       ninside++;
   }
 
-  /*
-  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
-    [&] (const int j, int& count) {
-    const F_FLOAT dx = x(j,0) - xtmp;
-    const F_FLOAT dy = x(j,1) - ytmp;
-    const F_FLOAT dz = x(j,2) - ztmp;
-
-    int jtype = type(j);
-    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
-
-    // don't include atoms that share location with grid point
-    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
-      jtype = -1; // use -1 to signal it's outside the radius
-    }
-
-    type_cache[j] = jtype;
-
-    if (jtype >= 0)
-     count++;
-
-  }, ninside);
-  */
-
   d_ninside(ii) = ninside;
 
   // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
@@ -521,75 +465,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
       offset++;
     }
   }
-
-  /*
-  int offset = 0;
-  for (int j = 0; j < ntotal; j++){
-    const int jtype = type_cache[j];
-    if (jtype >= 0) {
-      printf(">>> offset: %d\n", offset);
-      const F_FLOAT dx = x(j,0) - xtmp;
-      const F_FLOAT dy = x(j,1) - ytmp;
-      const F_FLOAT dz = x(j,2) - ztmp;
-      int jtype = type(j);
-      int jelem = 0;
-      if (chemflag) jelem = d_map[jtype];
-      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
-      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
-      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
-      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
-      // actually since the views here have values starting at 0, let's use jelem
-      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-      snaKK.inside(ii,offset) = j;
-      if (switchinnerflag) {
-        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
-      }
-      if (chemflag)
-        snaKK.element(ii,offset) = jelem;
-      else
-        snaKK.element(ii,offset) = 0;
-      offset++;
-    }
-  }
-  */
-
-  /*
-  Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
-    [&] (const int j, int& offset, bool final) {
-
-    const int jtype = type_cache[j];
-
-    if (jtype >= 0) {
-      if (final) {
-        const F_FLOAT dx = x(j,0) - xtmp;
-        const F_FLOAT dy = x(j,1) - ytmp;
-        const F_FLOAT dz = x(j,2) - ztmp;
-        int jtype = type(j);
-        int jelem = 0;
-        if (chemflag) jelem = d_map[jtype];
-        snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
-        snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
-        snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
-        // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
-        // actually since the views here have values starting at 0, let's use jelem
-        snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-        snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-        snaKK.inside(ii,offset) = j;
-        if (switchinnerflag) {
-          snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-          snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
-        }
-        if (chemflag)
-          snaKK.element(ii,offset) = jelem;
-        else
-          snaKK.element(ii,offset) = 0;
-      }
-      offset++;
-    }
-  });
-  */
 }
 
 /* ----------------------------------------------------------------------
@@ -821,11 +696,6 @@ void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (Tag
     // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
 
     // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
-    /*
-    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-    */
     xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
     xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
     xgrid[2] = h2*xgrid[2] + lo2;
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
index 087dbc5fd5..3835a56bf8 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp
@@ -23,59 +23,3 @@ template class ComputeSNAGridLocalKokkosHost<LMPHostType>;
 #endif
 
 }
-
-
-
-
-// The following chunk will compile but we're gonna try a wrapper approach like pair snap.
-/*
-#include "compute_sna_grid_kokkos.h"
-
-#include "atom_kokkos.h"
-#include "atom_masks.h"
-#include "comm.h"
-#include "error.h"
-#include "memory_kokkos.h"
-#include "modify.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "neighbor_kokkos.h"
-#include "sna_kokkos.h"
-#include "update.h"
-
-using namespace LAMMPS_NS;
-
-// ----------------------------------------------------------------------
-
-template<class DeviceType>
-ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
-  ComputeSNAGrid(lmp, narg, arg)
-{
-
-  printf("^^^ inside ComputeSNAGridKokkos constructor\n");
-  kokkosable = 1;
-  atomKK = (AtomKokkos *) atom;
-  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = EMPTY_MASK;
-  datamask_modify = EMPTY_MASK;
-
-}
-
-// ----------------------------------------------------------------------
-
-template<class DeviceType>
-ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
-{
-  if (copymode) return;
-
-
-}
-
-namespace LAMMPS_NS {
-template class ComputeSNAGridKokkos<LMPDeviceType>;
-#ifdef LMP_KOKKOS_GPU
-template class ComputeSNAGridKokkos<LMPHostType>;
-#endif
-}
-*/
-
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h
index 754d4e36af..2ffc050b2d 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h
@@ -29,38 +29,13 @@ ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice<LMPHostType>
 
 #include "compute_sna_grid_local.h"
 #include "kokkos_type.h"
-//#include "pair_snap.h"
-//#include "kokkos_type.h"
-//#include "neigh_list_kokkos.h"
 #include "sna_kokkos.h"
-//#include "pair_kokkos.h"
 
 namespace LAMMPS_NS {
 
 // Routines for both the CPU and GPU backend
-//template<int NEIGHFLAG, int EVFLAG>
-//struct TagPairSNAPComputeForce{};
-
 
 // GPU backend only
-/*
-struct TagPairSNAPComputeNeigh{};
-struct TagPairSNAPComputeCayleyKlein{};
-struct TagPairSNAPPreUi{};
-struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
-struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
-struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
-struct TagPairSNAPComputeZi{};
-struct TagPairSNAPBeta{};
-struct TagPairSNAPComputeBi{};
-struct TagPairSNAPComputeYi{};
-struct TagPairSNAPComputeYiWithZlist{};
-template<int dir>
-struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
-template<int dir>
-struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
-*/
-//struct TagPairSNAPPreUi{};
 struct TagCSNAGridLocalComputeNeigh{};
 struct TagCSNAGridLocalComputeCayleyKlein{};
 struct TagCSNAGridLocalPreUi{};
@@ -70,25 +45,11 @@ struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, ze
 template <bool chemsnap> struct TagCSNAGridLocalComputeZi{};
 template <bool chemsnap> struct TagCSNAGridLocalComputeBi{};
 struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array
-//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
 
 struct TagComputeSNAGridLocalLoop{};
 struct TagComputeSNAGridLocal3D{};
 
 // CPU backend only
-/*
-struct TagPairSNAPComputeNeighCPU{};
-struct TagPairSNAPPreUiCPU{};
-struct TagPairSNAPComputeUiCPU{};
-struct TagPairSNAPTransformUiCPU{};
-struct TagPairSNAPComputeZiCPU{};
-struct TagPairSNAPBetaCPU{};
-struct TagPairSNAPComputeBiCPU{};
-struct TagPairSNAPZeroYiCPU{};
-struct TagPairSNAPComputeYiCPU{};
-struct TagPairSNAPComputeDuidrjCPU{};
-struct TagPairSNAPComputeDeidrjCPU{};
-*/
 struct TagComputeSNAGridLocalLoopCPU{};
 
 //template<class DeviceType>
@@ -184,9 +145,6 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
   KOKKOS_INLINE_FUNCTION
   void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridLocalComputeNeigh>::member_type& team) const;
 
-  // PrintNeigh
-  //void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPrintNeigh>::member_type& team) const;
-
   // 3D case - used by parallel_for
   KOKKOS_INLINE_FUNCTION
   void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const;
@@ -274,16 +232,6 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
   DAT::tdual_float_2d k_alocal;
   typename AT::t_float_2d d_alocal;
 
-  /*
-  DAT::tdual_float_2d k_grid;
-  DAT::tdual_float_2d k_gridall;
-  typename AT::t_float_2d d_grid;
-  typename AT::t_float_2d d_gridall;
-
-  DAT::tdual_float_4d k_gridlocal;
-  typename AT::t_float_4d d_gridlocal;
-  */
-
 
   // Utility routine which wraps computing per-team scratch size requirements for
   // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
@@ -293,11 +241,6 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal {
   class DomainKokkos *domainKK;
 
   // triclinic vars
-  /*
-  xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-  xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-  xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-  */
   double h0, h1, h2, h3, h4, h5;
   double lo0, lo1, lo2;
 
@@ -320,7 +263,6 @@ class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos<DeviceT
   ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **);
 
   void compute_local() override;
-  //void setup() override;
 
 };
 
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
index 434db594ef..734706d2a3 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -27,7 +27,6 @@
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "neighbor_kokkos.h"
-//#include "sna_kokkos.h"
 #include "domain.h"
 #include "domain_kokkos.h"
 #include "sna.h"
@@ -131,15 +130,10 @@ ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridL
 template<class DeviceType, typename real_type, int vector_length>
 ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridLocalKokkos()
 {
-  //printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode);
   if (copymode) return;
-  //printf(">>> After copymode\n");
 
   memoryKK->destroy_kokkos(k_cutsq,cutsq);
   memoryKK->destroy_kokkos(k_alocal,alocal);
-  //memoryKK->destroy_kokkos(k_grid,grid);
-  //memoryKK->destroy_kokkos(k_gridall, gridall);
-  //memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
 }
 
 // Setup
@@ -148,28 +142,11 @@ template<class DeviceType, typename real_type, int vector_length>
 void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::setup()
 {
 
-  // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
-  // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
-
-  //ComputeGrid::set_grid_global();
-  //ComputeGrid::set_grid_local();
-  //ComputeSNAGridLocal::setup();
   ComputeGridLocal::setup();
 
   // allocate arrays
-  //memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
   memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal");
-
-  // do not use or allocate gridlocal for now
-
-  //gridlocal_allocated = 0;
-  //array = gridall;
-
   array_local = alocal;
-
-  //d_gridlocal = k_gridlocal.template view<DeviceType>();
-  //d_grid = k_grid.template view<DeviceType>();
-  //d_gridall = k_gridall.template view<DeviceType>();
   d_alocal = k_alocal.template view<DeviceType>();
 }
 
@@ -183,8 +160,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
     return;
   }
 
-  //printf(">>> ComputeSNAGridLocalKokkos::compute_local begin\n");
-
   copymode = 1;
 
   zlen = nzhi-nzlo+1;
@@ -205,12 +180,10 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
 
   ntotal = atomKK->nlocal + atomKK->nghost;
   // Allocate view for number of neighbors per grid point
-  //printf(">>> total_range: %d\n", total_range);
   MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range);
 
   // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
   // `total_range` is the number of grid points which may be larger than chunk size.
-  //printf(">>> total_range: %d\n", total_range);
   chunk_size = MIN(chunksize, total_range);
   chunk_offset = 0;
   //snaKK.grow_rij(chunk_size, ntotal);
@@ -222,11 +195,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
   const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
 
   if (triclinic) {
-    /*
-    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-    */
     h0 = domain->h[0];
     h1 = domain->h[1];
     h2 = domain->h[2];
@@ -243,7 +211,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::compute_lo
     if (chunk_size > total_range - chunk_offset)
       chunk_size = total_range - chunk_offset;
 
-    //printf(">>> chunk_offset: %d\n", chunk_offset);
 
     //ComputeNeigh
     {
@@ -401,7 +368,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
 
   // index ii already captures the proper grid point
   //int igrid = iz * (nx * ny) + iy * nx + ix;
-  //printf("%d %d\n", ii, igrid);
 
   // grid2x converts igrid to ix,iy,iz like we've done before
   // multiply grid integers by grid spacing delx, dely, delz
@@ -419,11 +385,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
     // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
 
     // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
-    /*
-    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-    */
     xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
     xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
     xgrid[2] = h2*xgrid[2] + lo2;
@@ -454,14 +415,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
   if (chemflag) ielem = d_map[itype];
   //const double radi = d_radelem[ielem];
 
-  // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
-  // The purpose here is to transform for triclinic boxes.
-  /*
-  if (triclinic){
-    printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
-  }
-  */
-
   // Compute the number of neighbors, store rsq
   int ninside = 0;
 
@@ -482,29 +435,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
       ninside++;
   }
 
-  /*
-  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
-    [&] (const int j, int& count) {
-    const F_FLOAT dx = x(j,0) - xtmp;
-    const F_FLOAT dy = x(j,1) - ytmp;
-    const F_FLOAT dz = x(j,2) - ztmp;
-
-    int jtype = type(j);
-    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
-
-    // don't include atoms that share location with grid point
-    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
-      jtype = -1; // use -1 to signal it's outside the radius
-    }
-
-    type_cache[j] = jtype;
-
-    if (jtype >= 0)
-     count++;
-
-  }, ninside);
-  */
-
   d_ninside(ii) = ninside;
 
   // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
@@ -539,75 +469,6 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
       offset++;
     }
   }
-
-  /*
-  int offset = 0;
-  for (int j = 0; j < ntotal; j++){
-    const int jtype = type_cache[j];
-    if (jtype >= 0) {
-      printf(">>> offset: %d\n", offset);
-      const F_FLOAT dx = x(j,0) - xtmp;
-      const F_FLOAT dy = x(j,1) - ytmp;
-      const F_FLOAT dz = x(j,2) - ztmp;
-      int jtype = type(j);
-      int jelem = 0;
-      if (chemflag) jelem = d_map[jtype];
-      snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
-      snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
-      snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
-      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
-      // actually since the views here have values starting at 0, let's use jelem
-      snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-      snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-      snaKK.inside(ii,offset) = j;
-      if (switchinnerflag) {
-        snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-        snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
-      }
-      if (chemflag)
-        snaKK.element(ii,offset) = jelem;
-      else
-        snaKK.element(ii,offset) = 0;
-      offset++;
-    }
-  }
-  */
-
-  /*
-  Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
-    [&] (const int j, int& offset, bool final) {
-
-    const int jtype = type_cache[j];
-
-    if (jtype >= 0) {
-      if (final) {
-        const F_FLOAT dx = x(j,0) - xtmp;
-        const F_FLOAT dy = x(j,1) - ytmp;
-        const F_FLOAT dz = x(j,2) - ztmp;
-        int jtype = type(j);
-        int jelem = 0;
-        if (chemflag) jelem = d_map[jtype];
-        snaKK.rij(ii,offset,0) = static_cast<real_type>(dx);
-        snaKK.rij(ii,offset,1) = static_cast<real_type>(dy);
-        snaKK.rij(ii,offset,2) = static_cast<real_type>(dz);
-        // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
-        // actually since the views here have values starting at 0, let's use jelem
-        snaKK.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
-        snaKK.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
-        snaKK.inside(ii,offset) = j;
-        if (switchinnerflag) {
-          snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
-          snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
-        }
-        if (chemflag)
-          snaKK.element(ii,offset) = jelem;
-        else
-          snaKK.element(ii,offset) = 0;
-      }
-      offset++;
-    }
-  });
-  */
 }
 
 /* ----------------------------------------------------------------------
@@ -839,22 +700,11 @@ void ComputeSNAGridLocalKokkos<DeviceType, real_type, vector_length>::operator()
     // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
 
     // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
-    /*
-    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
-    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
-    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
-    */
     xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
     xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
     xgrid[2] = h2*xgrid[2] + lo2;
   }
 
-  //const F_FLOAT xtmp = xgrid[0];
-  //const F_FLOAT ytmp = xgrid[1];
-  //const F_FLOAT ztmp = xgrid[2];
-  //d_gridall(igrid,0) = xtmp;
-  //d_gridall(igrid,1) = ytmp;
-  //d_gridall(igrid,2) = ztmp;
 
   const auto idxb_max = snaKK.idxb_max;
 
diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp
index 81286f9d81..8a747a7908 100644
--- a/src/ML-SNAP/compute_gaussian_grid_local.cpp
+++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp
@@ -89,14 +89,12 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char *
 
 ComputeGaussianGridLocal::~ComputeGaussianGridLocal()
 {
-  //printf(">>> ComputeGaussianGridLocal begin destruct copymode %d\n", copymode);
   if (copymode) return;
   memory->destroy(radelem);
   memory->destroy(sigmaelem);
   memory->destroy(prefacelem);
   memory->destroy(argfacelem);
   memory->destroy(cutsq);
-  //printf(">>> ComputeGaussianGridLocal end destruct\n");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -111,8 +109,6 @@ void ComputeGaussianGridLocal::init()
 
 void ComputeGaussianGridLocal::compute_local()
 {
-  //printf(">>> compute_local CPU\n");
-  //printf(">>> size_local_cols_base, size_local_cols: %d %d\n", size_local_cols_base, size_local_cols);
   invoked_local = update->ntimestep;
 
   // compute gaussian for each gridpoint
diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp
index dce2ab0283..12135c705d 100644
--- a/src/ML-SNAP/compute_grid.cpp
+++ b/src/ML-SNAP/compute_grid.cpp
@@ -88,7 +88,6 @@ void ComputeGrid::grid2x(int igrid, double *x)
   x[2] = iz * delz;
 
   if (triclinic) domain->lamda2x(x, x);
-  //printf(">>>>> ComputeGrid::grid2x\n");
 }
 
 /* ----------------------------------------------------------------------
@@ -104,7 +103,6 @@ void ComputeGrid::assign_coords_all()
     gridall[igrid][1] = x[1];
     gridall[igrid][2] = x[2];
   }
-  //printf(">>>>> ComputeGrid::assign_coords_all\n");
 }
 
 /* ----------------------------------------------------------------------
@@ -113,7 +111,6 @@ void ComputeGrid::assign_coords_all()
 
 void ComputeGrid::allocate()
 {
-  //printf(">>> ComputeGrid::allocate\n");
   // allocate arrays
   memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
   memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp
index 92bb556c50..80feb75be5 100644
--- a/src/ML-SNAP/compute_grid_local.cpp
+++ b/src/ML-SNAP/compute_grid_local.cpp
@@ -61,9 +61,7 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeGridLocal::~ComputeGridLocal()
 {
-  //printf(">>> ComputeGridLocal begin destruct\n");
   deallocate();
-  //printf(">>> ComputeGridLocal end destruct\n");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -75,7 +73,6 @@ void ComputeGridLocal::setup()
   set_grid_local();
   allocate();
   assign_coords();
-  //printf(">>> ComputeGridLocal setup nx ny nz %d %d %d %d %d %d\n", nxlo, nxhi, nylo, nyhi, nzlo, nzhi);
 }
 
 /* ----------------------------------------------------------------------
@@ -109,7 +106,6 @@ void ComputeGridLocal::grid2lamda(int ix, int iy, int iz, double *x)
 
 void ComputeGridLocal::allocate()
 {
-  //printf(">>> ComputeGridLocal::allocate %d %d\n", size_local_rows, size_local_cols);
   if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
     gridlocal_allocated = 1;
     memory->create(alocal, size_local_rows, size_local_cols, "compute/grid/local:alocal");
@@ -123,14 +119,12 @@ void ComputeGridLocal::allocate()
 
 void ComputeGridLocal::deallocate()
 {
-  //printf(">>> ComputeGridLocal::deallocate begin gridlocal_allocated %d copymode %d\n", gridlocal_allocated, copymode);
   if (copymode) return;
 
   if (gridlocal_allocated) {
     gridlocal_allocated = 0;
     memory->destroy(alocal);
   }
-  //printf(">>> ComputeGridLocal:: deallocate end\n");
   array_local = nullptr;
 }
 
@@ -186,8 +180,6 @@ void ComputeGridLocal::set_grid_local()
   //   the 2 equality if tests ensure a consistent decision
   //   as to which proc owns it
 
-  //printf(">>> ComputeGridLocal set_grid_local\n");
-
   double xfraclo, xfrachi, yfraclo, yfrachi, zfraclo, zfrachi;
 
   if (comm->layout != Comm::LAYOUT_TILED) {

From c0be84356ebc49cdde896db41c704758e0486077 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Wed, 22 Jan 2025 08:13:36 +0100
Subject: [PATCH 50/51] Removed debugging output, added contributor, added
 files to Install.sh

---
 src/KOKKOS/Install.sh                           | 1 +
 src/KOKKOS/compute_sna_grid_local_kokkos_impl.h | 3 +--
 src/KOKKOS/pair_mliap_kokkos.cpp                | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 3da88f2fc4..efbf7bfaff 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -125,6 +125,7 @@ action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp
 action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp
 action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h
 action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp
+action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local_kokkos.cpp
 action compute_temp_deform_kokkos.cpp
 action compute_temp_deform_kokkos.h
 action compute_temp_kokkos.cpp
diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
index 734706d2a3..01bb2b427b 100644
--- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h
@@ -11,8 +11,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
-                         Evan Weinberg (NVIDIA)
+   Contributing authors: Andrew Rohskopf (SNL)
 ------------------------------------------------------------------------- */
 
 #include "compute_sna_grid_local_kokkos.h"
diff --git a/src/KOKKOS/pair_mliap_kokkos.cpp b/src/KOKKOS/pair_mliap_kokkos.cpp
index 8b9305d48c..6c98399416 100644
--- a/src/KOKKOS/pair_mliap_kokkos.cpp
+++ b/src/KOKKOS/pair_mliap_kokkos.cpp
@@ -233,7 +233,6 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
   // map[i] = which element the Ith atom type is, -1 if not mapped
   // map[0] is not used
 
-  //printf(">>> ntypes: %d\n", atom->ntypes);
   for (int i = 1; i <= atom->ntypes; i++) {
     char* elemname = elemtypes[i-1];
     int jelem;

From 6b4ecfd719af983e65910dd023c65d403f7ad846 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 22 Jan 2025 15:33:09 -0700
Subject: [PATCH 51/51] Fix issues with GNU Make build

---
 src/.gitignore        | 2 ++
 src/KOKKOS/Install.sh | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/.gitignore b/src/.gitignore
index c1f6b6e892..45f7a9f1a0 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -252,6 +252,8 @@
 /*rheo*.cpp
 /*rheo*.h
 
+/compute_gaussian_grid_local.cpp
+/compute_gaussian_grid_local.h
 /compute_grid.cpp
 /compute_grid.h
 /compute_grid_local.cpp
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 191be47ff6..d34d5eb9ee 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -125,7 +125,8 @@ action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp
 action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp
 action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h
 action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp
-action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local_kokkos.cpp
+action compute_gaussian_grid_local_kokkos.cpp compute_gaussian_grid_local.cpp
+action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local.h
 action compute_temp_deform_kokkos.cpp
 action compute_temp_deform_kokkos.h
 action compute_temp_kokkos.cpp