Add all SNAP computations and 4D view Kokkos memory allocator

rohskopf
2023-04-08 11:19:07 -06:00
parent de4dbec661
commit 212b864052
7 changed files with 1118 additions and 57 deletions

View File

@@ -23,3 +23,59 @@ template class ComputeSNAGridKokkosHost<LMPHostType>;
#endif
}
// The following chunk will compile, but we're going to try a wrapper approach like pair snap.
/*
#include "compute_sna_grid_kokkos.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "memory_kokkos.h"
#include "modify.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor_kokkos.h"
#include "sna_kokkos.h"
#include "update.h"
using namespace LAMMPS_NS;
// ----------------------------------------------------------------------
template<class DeviceType>
ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
ComputeSNAGrid(lmp, narg, arg)
{
printf("^^^ inside ComputeSNAGridKokkos constructor\n");
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK;
}
// ----------------------------------------------------------------------
template<class DeviceType>
ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
{
if (copymode) return;
}
namespace LAMMPS_NS {
template class ComputeSNAGridKokkos<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class ComputeSNAGridKokkos<LMPHostType>;
#endif
}
*/

View File

@@ -29,42 +29,233 @@ ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
#include "compute_sna_grid.h"
#include "kokkos_type.h"
//#include "pair_snap.h"
//#include "kokkos_type.h"
//#include "neigh_list_kokkos.h"
#include "sna_kokkos.h"
//#include "pair_kokkos.h"
namespace LAMMPS_NS {
// Routines for both the CPU and GPU backend
//template<int NEIGHFLAG, int EVFLAG>
//struct TagPairSNAPComputeForce{};
// GPU backend only
/*
struct TagPairSNAPComputeNeigh{};
struct TagPairSNAPComputeCayleyKlein{};
struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagPairSNAPComputeZi{};
struct TagPairSNAPBeta{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeYiWithZlist{};
template<int dir>
struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
template<int dir>
struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
*/
//struct TagPairSNAPPreUi{};
struct TagCSNAGridComputeNeigh{};
struct TagCSNAGridComputeCayleyKlein{};
struct TagCSNAGridPreUi{};
struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagCSNAGridComputeZi{};
struct TagCSNAGridComputeBi{};
struct TagComputeSNAGridLoop{};
struct TagComputeSNAGrid3D{};
//struct TagCSNAGridTeam{};
// CPU backend only
/*
struct TagPairSNAPComputeNeighCPU{};
struct TagPairSNAPPreUiCPU{};
struct TagPairSNAPComputeUiCPU{};
struct TagPairSNAPTransformUiCPU{};
struct TagPairSNAPComputeZiCPU{};
struct TagPairSNAPBetaCPU{};
struct TagPairSNAPComputeBiCPU{};
struct TagPairSNAPZeroYiCPU{};
struct TagPairSNAPComputeYiCPU{};
struct TagPairSNAPComputeDuidrjCPU{};
struct TagPairSNAPComputeDeidrjCPU{};
*/
struct TagComputeSNAGridLoopCPU{};
//template<class DeviceType>
template<class DeviceType, typename real_type_, int vector_length_>
class ComputeSNAGridKokkos : public ComputeSNAGrid {
public:
//enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD};
//enum {COUL_FLAG=0};
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef EV_FLOAT value_type;
static constexpr int vector_length = vector_length_;
using real_type = real_type_;
//using complex = SNAComplex<real_type>;
using complex = SNAComplex<real_type>;
// Static team/tile sizes for device offload
#ifdef KOKKOS_ENABLE_HIP
static constexpr int team_size_compute_neigh = 2;
static constexpr int tile_size_compute_ck = 2;
static constexpr int tile_size_pre_ui = 2;
static constexpr int team_size_compute_ui = 2;
static constexpr int tile_size_transform_ui = 2;
static constexpr int tile_size_compute_zi = 2;
static constexpr int tile_size_compute_bi = 2;
static constexpr int tile_size_transform_bi = 2;
static constexpr int tile_size_compute_yi = 2;
static constexpr int team_size_compute_fused_deidrj = 2;
#else
static constexpr int team_size_compute_neigh = 4;
static constexpr int tile_size_compute_ck = 4;
static constexpr int tile_size_pre_ui = 4;
static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
static constexpr int tile_size_transform_ui = 4;
static constexpr int tile_size_compute_zi = 8;
static constexpr int tile_size_compute_bi = 4;
static constexpr int tile_size_transform_bi = 4;
static constexpr int tile_size_compute_yi = 8;
static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
#endif
// Custom MDRangePolicy, Rank<3>, to reduce the verbosity of kernel launches.
// This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...> boilerplate
// and reduces the verbosity of the LaunchBounds by hiding the explicit
// multiplication by vector_length
template <class Device, int num_tiles, class TagComputeSNAP>
using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagComputeSNAP>;
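// For illustration only (this mirrors the launches in compute_array()):
//   Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
//     policy_compute_zi({0,0,0}, {vector_length, idxz_max, chunk_size_div},
//                       {vector_length, tile_size_compute_zi, 1});
//   Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);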
// MDRangePolicy for the 3D grid loop:
template <class Device, class TagComputeSNAP>
using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
// Testing out team policies
template <class Device, int num_teams, class TagComputeSNAP>
using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
//using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNAP>;
//using team_member = typename team_policy::member_type;
// Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches.
// This hides the LaunchBounds boilerplate and the explicit
// multiplication by vector_length
template <class Device, int num_teams, class TagComputeSNAP>
using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
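// For illustration only (this mirrors ComputeNeigh in compute_array()): a league of
// chunk_size teams, num_teams threads per team, vector_length vector lanes, plus
// optional per-team scratch:
//   SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
//     policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
//   policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
//   Kokkos::parallel_for("ComputeNeigh", policy_neigh, *this);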
ComputeSNAGridKokkos(class LAMMPS *, int, char **);
~ComputeSNAGridKokkos() override;
void init() override;
//void compute_array(int, int) override;
//double memory_usage() override;
void setup() override;
void compute_array() override;
// Utility functions for teams
template<class TagStyle>
void check_team_size_for(int, int&);
template<class TagStyle>
void check_team_size_reduce(int, int&);
// operator function for example team policy
//KOKKOS_INLINE_FUNCTION
//void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeam>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLoop, const int& ) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLoopCPU, const int&) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
// 3D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
protected:
SNAKokkos<DeviceType, real_type, vector_length> snaKK;
int chunk_size, chunk_offset;
int host_flag;
int ntotal;
int total_range; // total number of loop iterations in grid
int zlen; //= nzhi-nzlo+1;
int ylen; //= nyhi-nylo+1;
int xlen; //= nxhi-nxlo+1;
using KKDeviceType = typename KKDevice<DeviceType>::value;
double cutsq_tmp; // temporary cutsq until we get a view
Kokkos::View<real_type*, DeviceType> d_radelem; // element radii
Kokkos::View<real_type*, DeviceType> d_wjelem; // elements weights
//Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem; // element bispectrum coefficients
Kokkos::View<real_type*, DeviceType> d_sinnerelem; // element inner cutoff midpoint
Kokkos::View<real_type*, DeviceType> d_dinnerelem; // element inner cutoff half-width
Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list
Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements
typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
tdual_fparams k_cutsq;
typedef Kokkos::View<const F_FLOAT**, DeviceType,
Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
t_fparams_rnd rnd_cutsq;
typename AT::t_x_array_randomread x;
typename AT::t_int_1d_randomread type;
DAT::tdual_float_2d k_grid;
DAT::tdual_float_2d k_gridall;
typename AT::t_float_2d d_grid;
typename AT::t_float_2d d_gridall;
//DAT::tdual_float_4d k_gridlocal;
//typedef Kokkos::DualView<real_type****, Kokkos::LayoutLeft, DeviceType> t_gridlocal_4d;
//typedef Kokkos::View<real_type****, DeviceType> t_4d;
typedef Kokkos::DualView<LMP_FLOAT****, LMPDeviceType> tdual_float_4d;
tdual_float_4d k_gridlocal;
tdual_float_4d d_gridlocal;
// Utility routine which wraps computing per-team scratch size requirements for
// ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
template <typename scratch_type>
int scratch_size_helper(int values_per_team);
};
// These wrapper classes exist to make the compute style factory happy/avoid having
// to extend the compute style factory to support Compute classes w/an arbitrary number
// of extra template parameters
template <class DeviceType>
@@ -76,10 +267,9 @@ class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_
public:
ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
//ComputeSNAGridKokkosDevice(class LAMMPS *);
void init() override;
//double memory_usage() override;
void compute_array() override;
};
@@ -93,10 +283,9 @@ class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KO
public:
ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
//ComputeSNAGridKokkosHost(class LAMMPS *);
void init();
//double memory_usage();
void init() override;
void compute_array() override;
};
#endif
@@ -105,3 +294,45 @@ class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KO
#endif
#endif
// The following will compile with the chunk in the cpp file, but we're going to try a wrapper approach like pair snap.
/*
#ifdef COMPUTE_CLASS
// clang-format off
ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
// clang-format on
#else
// clang-format off
#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
#include "compute_sna_grid.h"
#include "kokkos_type.h"
namespace LAMMPS_NS {
//template<int CSTYLE, int NCOL>
//struct TagComputeCoordAtom{};
template<class DeviceType>
class ComputeSNAGridKokkos : public ComputeSNAGrid {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
ComputeSNAGridKokkos(class LAMMPS *, int, char **);
~ComputeSNAGridKokkos() override;
private:
};
}
#endif
#endif
*/

View File

@@ -3,12 +3,10 @@
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
@@ -18,17 +16,20 @@
------------------------------------------------------------------------- */
#include "compute_sna_grid_kokkos.h"
#include "pair_snap_kokkos.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "force.h"
#include "kokkos.h"
#include "memory_kokkos.h"
#include "neighbor_kokkos.h"
#include "modify.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor_kokkos.h"
//#include "sna_kokkos.h"
#include "sna.h"
#include "update.h"
#include <cmath>
#include <cstdlib>
@@ -39,69 +40,757 @@
namespace LAMMPS_NS {
/* ---------------------------------------------------------------------- */
// Constructor
template<class DeviceType, typename real_type, int vector_length>
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid (lmp, narg, arg)
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
{
//respa_enable = 0;
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
//datamask_read = EMPTY_MASK;
//datamask_modify = EMPTY_MASK;
datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK;
//host_flag = (execution_space == Host);
k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
auto d_cutsq = k_cutsq.template view<DeviceType>();
rnd_cutsq = d_cutsq;
host_flag = (execution_space == Host);
// The ComputeSNAGrid constructor allocates `map`, so let's do the same here.
// Actually, let's move this down to init().
//int n = atom->ntypes;
//printf("^^^ realloc d_map\n");
//MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
printf("^^^^^ cutsq: %f\n", cutsq[1][1]);
cutsq_tmp = cutsq[1][1];
//memoryKK->create_kokkos(k_gridlocal,
//printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]);
}
/* ---------------------------------------------------------------------- */
// Destructor
template<class DeviceType, typename real_type, int vector_length>
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
{
if (copymode) return;
//memoryKK->destroy_kokkos(k_eatom,eatom);
//memoryKK->destroy_kokkos(k_vatom,vatom);
printf("^^^ Finish ComputeSNAGridKokkos destructor\n");
}
/* ---------------------------------------------------------------------- */
// Init
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
{
printf("^^^ Begin ComputeSNAGridKokkos init()\n");
// The part of pair_snap_kokkos_impl.h that allocates snap params is coeff(), and it
// calls the original coeff function. So let's do that here:
ComputeSNAGrid::init();
// Set up element lists
printf("^^^ Begin kokkos reallocs\n");
MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
MemKK::realloc_kokkos(d_wjelem,"pair:wjelem",nelements);
// pair snap kokkos uses `ncoeffall` in the following, inherited from the original class.
//MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff);
MemKK::realloc_kokkos(d_sinnerelem,"pair:sinnerelem",nelements);
MemKK::realloc_kokkos(d_dinnerelem,"pair:dinnerelem",nelements);
int n = atom->ntypes;
//printf("^^^ realloc d_map\n");
printf("^^^ n: %d\n", n);
MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
printf("^^^ begin mirrow view creation\n");
auto h_radelem = Kokkos::create_mirror_view(d_radelem);
auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
//auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem);
auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
auto h_map = Kokkos::create_mirror_view(d_map);
printf("^^^ begin loop over elements, nelements = %d\n", nelements);
for (int ielem = 0; ielem < nelements; ielem++) {
printf("^^^^^ ielem %d\n", ielem);
h_radelem(ielem) = radelem[ielem];
printf("^^^^^ 1\n");
h_wjelem(ielem) = wjelem[ielem];
printf("^^^^^ 2\n");
if (switchinnerflag){
h_sinnerelem(ielem) = sinnerelem[ielem];
h_dinnerelem(ielem) = dinnerelem[ielem];
}
// pair snap kokkos uses `ncoeffall` in the following.
//for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) {
// h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff];
//}
}
printf("^^^ begin loop over map\n");
// NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where
// some things like `map` get allocated regardless of chem flag.
if (chemflag){
for (int i = 1; i <= atom->ntypes; i++) {
h_map(i) = map[i];
printf("%d\n", map[i]);
}
}
Kokkos::deep_copy(d_radelem,h_radelem);
Kokkos::deep_copy(d_wjelem,h_wjelem);
if (switchinnerflag){
Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
}
if (chemflag){
Kokkos::deep_copy(d_map,h_map);
}
snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
snaKK.grow_rij(0,0);
snaKK.init();
printf("^^^ inside ComputeSNAGridKokkos init\n");
// from pair_snap_kokkos_impl.h :
/*
if (host_flag) {
if (lmp->kokkos->nthreads > 1)
error->all(FLERR,"Pair style snap/kk can currently only run on a single "
"CPU thread");
PairSNAP::init_style();
// The following use of lmp->kokkos gives a compile error: pointer to incomplete class type not allowed.
//if (lmp->kokkos->nthreads > 1)
// error->all(FLERR,"Compute style sna/grid/kk can currently only run on a single "
// "CPU thread");
ComputeSNAGrid::init();
return;
}
if (force->newton_pair == 0)
error->all(FLERR,"Pair style SNAP requires newton pair on");
printf("^^^ Finished ComputeSNAGridKokkos init\n");
// neighbor list request for KOKKOS
neighflag = lmp->kokkos->neighflag;
auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
!std::is_same<DeviceType,LMPDeviceType>::value);
request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
if (neighflag == FULL)
error->all(FLERR,"Must use half neighbor list style with pair snap/kk");
*/
}
// Setup
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
{
// Do not call ComputeGrid::setup(); we don't want to allocate the grid array there.
// Instead, call ComputeGrid::set_grid_global() and set_grid_local() to set the n indices.
//ComputeGrid::setup();
printf("^^^^^ SETUP!\n");
//printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]);
ComputeGrid::set_grid_global();
ComputeGrid::set_grid_local();
// allocate arrays
memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
gridlocal_allocated = 1;
memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo,
nyhi, nxlo, nxhi, "grid:gridlocal");
}
array = gridall;
}
// Compute
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
{
printf("^^^ Begin ComputeSNAGridKokkos compute_array()\n");
if (DeviceType::in_parallel()) {
printf("^^^ compute_array() is a host function\n");
} else {
printf("^^^ compute_array() is not a host function\n");
}
if (host_flag) {
/*
atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK);
PairSNAP::compute(eflag_in,vflag_in);
atomKK->modified(Host,F_MASK);
*/
return;
}
copymode = 1;
zlen = nzhi-nzlo+1;
ylen = nyhi-nylo+1;
xlen = nxhi-nxlo+1;
printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi);
total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
x = atomKK->k_x.view<DeviceType>();
// This will error because trying to access host view on the device:
//printf("x(0,0): %f\n", x(0,0));
type = atomKK->k_type.view<DeviceType>();
k_cutsq.template sync<DeviceType>();
MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",total_range);
//printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi);
// Pair snap/kk calls grow_rij with some max number of neighbors, but compute sna/grid uses the total
// number of atoms.
//const int ntotal = atomKK->nlocal + atomKK->nghost;
ntotal = atomKK->nlocal + atomKK->nghost;
//printf("^^^ ntotal: %d\n", ntotal);
// ensure rij, inside, and typej are of size jnum
// snaKK.grow_rij(int, int) requires 2 args where one is a chunksize.
chunk_size = MIN(chunksize, total_range); // "chunksize" variable is set by user
//printf("^^^ chunk_size: %d\n", chunk_size);
snaKK.grow_rij(chunk_size, ntotal);
// Launch 3 teams of the maximum number of threads per team
//const int team_size_max = team_policy(3, 1).team_size_max(
// TagCSNAGridTeamPolicy, Kokkos::ParallelForTag());
//typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeamPolicy> team_policy_test(3,1);
// Using custom policy:
/*
CSNAGridTeamPolicy<DeviceType, team_size_compute_neigh ,TagCSNAGridTeam> team_policy(chunk_size,team_size_compute_neigh,vector_length);
//team_policy = team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("TeamPolicy",team_policy,*this);
*/
chunk_size = total_range;
printf("%d %d %d\n", chunk_size, team_size_compute_neigh, vector_length);
// team_size_compute_neigh is defined in `compute_sna_grid_kokkos.h`
// Pre-compute ceil(chunk_size / vector_length) for code cleanliness
const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
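// For example (illustrative values only): with vector_length = 32 and chunk_size = 100,
// chunk_size_div = 4, i.e. ceil(100/32); the partially filled final block is masked off
// inside the kernels by their `ii >= chunk_size` early returns.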
//ComputeNeigh
{
int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * ntotal);
SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
}
//ComputeCayleyKlein
{
// tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
policy_compute_ck({0,0,0}, {vector_length, ntotal, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
}
//PreUi
{
// tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
Kokkos::parallel_for("PreUi",policy_preui,*this);
}
// ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
{
// team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
// scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
const int tile_size = vector_length * (twojmax + 1);
const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
if (chunk_size < parallel_thresh)
{
// Version with parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
const int n_teams = chunk_size_div * ntotal * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
} else {
// Version w/out parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal)
const int n_teams = chunk_size_div * ntotal;
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
}
}
//TransformUi: un-"fold" ulisttot, zero ylist
{
// tile_size_transform_ui is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
}
//Compute bispectrum in AoSoA data layout, transform Bi
//if (quadraticflag || eflag) {
//ComputeZi
const int idxz_max = snaKK.idxz_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
//ComputeBi
const int idxb_max = snaKK.idxb_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
//Looks like best way to grab blist is in a parallel_for
//Transform data layout of blist out of AoSoA
//We need this because `blist` gets used in ComputeForce which doesn't
//take advantage of AoSoA, which at best would only be beneficial on the margins
//NOTE: Do we need this in compute sna/grid/kk?
/*
Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagPairSNAPTransformBi>
policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
*/
// let's try a simple parallel for loop
// NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this
// function, because this is a host-device function.
/*
typename Kokkos::RangePolicy<DeviceType,TagComputeSNAGridLoop> policy_loop(0,4);
Kokkos::parallel_for("Loop",policy_loop,*this);
*/
// Simple working loop:
/*
Kokkos::parallel_for("Loop1", 4, KOKKOS_LAMBDA (const int& i) {
printf("Greeting from iteration %i\n",i);
});
*/
/*
// NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this
// function, because this is a host-device function.
const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
policy_compute_ck({0,0,0},{vector_length,ntotal,chunk_size_div},{vector_length,tile_size_compute_ck,1});
Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this);
*/
// Simple example of 3D MD range policy.
// Begin loop over grid points.
/*
// NOTE: We don't get the compiler error calling host function DeviceType::in_parallel() in this
// function, but we get it in the above function.
int n = 3; // bounds for mdrange policy
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagComputeSNAGrid3D> policy_3d({0,0,0},{n,n,n});
Kokkos::parallel_for("3D",policy_3d,*this);
*/
printf("^^^ End ComputeSNAGridKokkos compute_array()\n");
}
/* ----------------------------------------------------------------------
Begin routines that are unique to the GPU codepath. These take advantage
of AoSoA data layouts and scratch memory for recursive polynomials
------------------------------------------------------------------------- */
/*
Simple team policy functor seeing how many layers deep we can go with the parallelism.
*/
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
// this function is following the same procedure in ComputeNeigh of PairSNAPKokkos
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// basic quantities associated with this team:
// team_rank : rank of thread in this team
// league_rank : rank of team in this league
// team_size : number of threads in this team
//printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size());
// extract loop index
int ii = team.team_rank() + team.league_rank() * team.team_size();
if (ii >= chunk_size) return;
// get a pointer to scratch memory
// This is used to cache whether or not an atom is within the cutoff.
// If it is, type_cache is set to the atom type; if not, it is set to -1.
const int tile_size = ntotal; // number of elements per thread
const int team_rank = team.team_rank();
const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
//printf("ntotal scratch_shift: %d %d\n", ntotal, scratch_shift);
int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
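// Scratch layout, for reference: compute_array() sizes the team scratch for
// team_size_compute_neigh * ntotal ints via scratch_size_helper<int>, so each
// thread owns the ntotal-int slice starting at team_rank * ntotal.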
//printf("ii: %d\n", ii);
// convert to grid indices
int iz = ii/(xlen*ylen);
int i2 = ii - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
iy += nylo;
ix += nxlo;
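// Worked example of this mapping (illustrative values only): total_range is flattened as
//   ii = (iz - nzlo)*xlen*ylen + (iy - nylo)*xlen + (ix - nxlo),
// so with xlen = ylen = 4, ii = 21 unpacks to iz = nzlo + 1, iy = nylo + 1, ix = nxlo + 1.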
double xgrid[3];
//int igrid = iz * (nx * ny) + iy * nx + ix;
// these end up being the same...?
//printf("ii igrid: %d %d\n", ii, igrid);
// grid2x converts igrid to ix,iy,iz like we've done before
//grid2x(igrid, xgrid);
xgrid[0] = ix * delx;
xgrid[1] = iy * dely;
xgrid[2] = iz * delz;
const double xtmp = xgrid[0];
const double ytmp = xgrid[1];
const double ztmp = xgrid[2];
// currently, all grid points are type 1
// not clear what a better choice would be
const int itype = 1;
const int ielem = d_map[itype];
const double radi = d_radelem[ielem];
// We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
if (triclinic){
printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
} else {
//printf("We are not triclinic\n");
}
// can check xgrid positions with original
//printf("%f %f %f\n", xgrid[0], xgrid[1], xgrid[2]);
// Compute the number of neighbors, store rsq
int ninside = 0;
// want to loop over ntotal... keep getting seg fault when accessing type_cache[j]?
//printf("ntotal: %d\n", ntotal);
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
[&] (const int j, int& count) {
// From pair snap/kk :
/*
T_INT j = d_neighbors(i,jj);
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
*/
// From compute sna/grid/kk :
/*
const double delx = xtmp - x[j][0];
const double dely = ytmp - x[j][1];
const double delz = ztmp - x[j][2];
*/
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
//printf("dx: %f\n", dx);
//const double rsq = delx * delx + dely * dely + delz * delz;
int jtype = type(j);
//printf("jtype: %d\n", jtype);
//int jelem = 0;
//if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) {
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
//if (rsq >= rnd_cutsq(itype,jtype)) {
if (rsq >= cutsq_tmp){
jtype = -1; // use -1 to signal it's outside the radius
}
//printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype));
if (j > 340){
printf("j: %d\n", j);
}
//printf("j: %d\n", j);
type_cache[j] = jtype;
if (jtype >= 0)
count++;
}, ninside);
//printf("ninside: %d\n", ninside);
d_ninside(ii) = ninside;
// TODO: Make sure itype is appropriate instead of ielem
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
[&] (const int j, int& offset, bool final) {
const int jtype = type_cache[j];
if (jtype >= 0) {
if (final) {
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
const int jelem = d_map[jtype];
my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[jelem])*rcutfac);
my_sna.inside(ii,offset) = j;
if (switchinnerflag) {
my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[itype] + d_sinnerelem[jelem]);
my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[itype] + d_dinnerelem[jelem]);
}
if (chemflag)
my_sna.element(ii,offset) = jelem;
else
my_sna.element(ii,offset) = 0;
}
offset++;
}
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
//printf("^^^ ComputeCayleyKlein\n");
/*
if (DeviceType::in_parallel()) {
printf("operator() of TagCSNAGridComputeCayleyKlein is a host function\n");
} else {
printf("operator() of TagCSNAGridComputeCayleyKlein is not a host function\n");
}
*/
const int ii = iatom_mod + iatom_div * vector_length;
if (ii >= chunk_size) return;
const int ninside = ntotal; //d_ninside(ii);
if (jnbor >= ninside) return;
my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int ii = iatom_mod + iatom_div * vector_length;
if (ii >= chunk_size) return;
int itype = type(ii);
int ielem = d_map[itype];
my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend_location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / (ntotal * (twojmax + 1)); // removed "const" to work around GCC 7 bug
const int jj_jbend = flattened_idx - iatom_div * (ntotal * (twojmax + 1));
const int jbend = jj_jbend / ntotal;
int jj = jj_jbend - jbend * ntotal; // removed "const" to work around GCC 7 bug
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / ntotal; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - iatom_div * ntotal;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (idxu > my_sna.idxu_max) return;
int elem_count = chemflag ? nelements : 1;
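// For reference, the loop below follows the same pattern as TagPairSNAPTransformUi:
// flip_sign == 1 negates the imaginary part, flip_sign == -1 negates the real part,
// and flip_sign == 0 marks entries whose ylist accumulators are zeroed here.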
for (int ielem = 0; ielem < elem_count; ielem++){
const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
if (mapper.flip_sign == 1){
utot_im = -utot_im;
} else if (mapper.flip_sign == -1){
utot_re = -utot_re;
}
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
if (mapper.flip_sign == 0) {
my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
}
}
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= my_sna.idxz_max) return;
my_sna.compute_zi(iatom_mod,jjz,iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjb >= my_sna.idxb_max) return;
my_sna.compute_bi(iatom_mod,jjb,iatom_div);
}
/*
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
}
*/
/* ----------------------------------------------------------------------
Begin routines that are unique to the CPU codepath. These do not take
advantage of AoSoA data layouts, but that could be a good point of
future optimization and unification with the above kernels. It's unlikely
that scratch memory optimizations will ever be useful for the CPU due to
different arithmetic intensity requirements for the CPU vs GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagComputeSNAGridLoopCPU,const int& ii) const {
}
/* ----------------------------------------------------------------------
utility functions
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
template<class TagStyle>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
template<class DeviceType, typename real_type, int vector_length>
template<class TagStyle>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
template<class DeviceType, typename real_type, int vector_length>
template<typename scratch_type>
int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
return ScratchViewType::shmem_size(values_per_team);
}
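// For illustration only, mirroring the ComputeUi launch in compute_array():
//   scratch_size_helper<complex>(team_size_compute_ui * tile_size)
// returns the per-team shared-memory bytes needed to cache that many `complex` values.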
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
routines used by template reference classes
------------------------------------------------------------------------- */
template<class DeviceType>
ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
: ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
@@ -112,6 +801,12 @@ void ComputeSNAGridKokkosDevice<DeviceType>::init()
Base::init();
}
template<class DeviceType>
void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
{
Base::compute_array();
}
#ifdef LMP_KOKKOS_GPU
template<class DeviceType>
ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
@@ -122,6 +817,12 @@ void ComputeSNAGridKokkosHost<DeviceType>::init()
{
Base::init();
}
template<class DeviceType>
void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
{
Base::compute_array();
}
#endif
}

View File

@@ -183,6 +183,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
return data;
}
/* ----------------------------------------------------------------------
create a 4d array with indices 2, 3, and 4 offset, but not the first
2nd index from n2lo to n2hi inclusive
3rd index from n3lo to n3hi inclusive
4th index from n4lo to n4hi inclusive
cannot grow it
------------------------------------------------------------------------- */
template <typename TYPE>
TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array,
int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi,
const char *name)
{
//if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array = nullptr;
printf("^^^^^ memoryKK->create_4d_offset_kokkos\n");
int n2 = n2hi - n2lo + 1;
int n3 = n3hi - n3lo + 1;
int n4 = n4hi - n4lo + 1;
data = TYPE(std::string(name),n1,n2,n3,n4);
bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1;
array = (typename TYPE::value_type ****) smalloc(nbytes,name);
for (int i = 0; i < n1; i++) {
if (n2 == 0) {
array[i] = nullptr;
} else {
nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2;
array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name);
for (int j = 0; j < n2; j++){
if (n3 == 0){
array[i][j] = nullptr;
} else {
nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3;
array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name);
for (int k = 0; k < n3; k++){
if (n4 == 0)
array[i][j][k] = nullptr;
else
array[i][j][k] = &data.h_view(i,j,k,0);
}
}
}
}
}
return data;
}
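// For illustration only -- this mirrors the call in ComputeSNAGridKokkos::setup():
//   memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols,
//                                    nzlo, nzhi, nylo, nyhi, nxlo, nxhi, "grid:gridlocal");
// which allocates the DualView and builds a host pointer structure whose innermost
// rows alias data.h_view(i,j,k,0).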
template <typename TYPE, typename HTYPE>
TYPE create_kokkos(TYPE &data, HTYPE &h_data,
typename TYPE::value_type **&array, int n1, int n2,

View File

@@ -57,6 +57,8 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :
ComputeGrid::~ComputeGrid()
{
printf("^^^ begin ComputeGrid destructor\n");
if (copymode) return;
deallocate();
}
@@ -111,7 +113,7 @@ void ComputeGrid::assign_coords_all()
void ComputeGrid::allocate()
{
// allocate arrays
printf("^^^^^^^^^^^^^^^ ComputeGrid::allocate()\n");
memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {

View File

@@ -31,14 +31,14 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
// skip over arguments used by base class
// so that argument positions are identical to
// regular per-atom compute
printf("^^^ inside compute sna grid constructor\n");
arg += nargbase;
narg -= nargbase;
// begin code common to all SNAP computes
double rfac0, rmin0;
int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
//double rfac0, rmin0;
//int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
int ntypes = atom->ntypes;
int nargmin = 6 + 2 * ntypes;
@@ -56,6 +56,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
wselfallflag = 0;
switchinnerflag = 0;
nelements = 1;
chunksize = 32768;
parallel_thresh = 8192;
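// Note for the Kokkos version: compute_sna_grid_kokkos caps the number of grid points
// handled per pass via chunk_size = MIN(chunksize, total_range), and parallel_thresh
// selects between the ComputeUiSmall and ComputeUiLarge kernels.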
// process required arguments
@@ -112,6 +114,7 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
iarg += 2;
} else if (strcmp(arg[iarg], "chem") == 0) {
printf("^^^ chem flag, creating map\n");
if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
chemflag = 1;
memory->create(map, ntypes + 1, "compute_sna_grid:map");
@@ -181,11 +184,17 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
ComputeSNAGrid::~ComputeSNAGrid()
{
memory->destroy(radelem);
memory->destroy(wjelem);
memory->destroy(cutsq);
delete snaptr;
if (copymode) return;
printf("^^^ begin ComputeSNAGrid destructor\n");
memory->destroy(radelem);
printf("^^^^ CSG 1\n");
memory->destroy(wjelem);
printf("^^^^ CSG 2\n");
memory->destroy(cutsq);
printf("^^^^ CSG 3\n");
delete snaptr;
printf("^^^^ CSG 4\n");
if (chemflag) memory->destroy(map);
}
@@ -196,12 +205,16 @@ void ComputeSNAGrid::init()
if ((modify->get_compute_by_style("^sna/grid$").size() > 1) && (comm->me == 0))
error->warning(FLERR, "More than one instance of compute sna/grid");
snaptr->init();
printf("^^^ finished ComputeSNAGrid init()\n");
}
/* ---------------------------------------------------------------------- */
void ComputeSNAGrid::compute_array()
{
printf("^^^ inside ComputeSNAGrid compute_array()\n");
invoked_array = update->ntimestep;
// compute sna for each gridpoint
@@ -211,6 +224,8 @@ void ComputeSNAGrid::compute_array()
int *const type = atom->type;
const int ntotal = atom->nlocal + atom->nghost;
printf("^^^ ntotal: %d\n", ntotal);
// ensure rij, inside, and typej are of size jnum
snaptr->grow_rij(ntotal);

View File

@@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid {
void init() override;
void compute_array() override;
double memory_usage() override;
int ncoeff,nelements; // public for kokkos, but could go in the protected block now
private:
int ncoeff;
protected:
//int ncoeff;
double **cutsq;
double rcutfac;
double *radelem;
double *wjelem;
int *map; // map types to [0,nelements)
int nelements, chemflag;
int chemflag;
int switchinnerflag;
double *sinnerelem;
double *dinnerelem;
int parallel_thresh;
class SNA *snaptr;
double cutmax;
int quadraticflag;
double rfac0, rmin0;
int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
int chunksize;
};
} // namespace LAMMPS_NS