Merge pull request #1 from rohskopf/sna-grid-kokkos

compute sna/grid/kokkos
2023-06-07 13:57:50 +02:00
parent e169327162 904fde8189
commit 5e01ffe6a6
11 changed files with 1474 additions and 12 deletions
--- a/src/KOKKOS/compute_sna_grid_kokkos.cpp
+++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp
@ -0,0 +1,81 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+#include "compute_sna_grid_kokkos_impl.h"
+
+namespace LAMMPS_NS {
+
+template class ComputeSNAGridKokkosDevice<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridKokkosHost<LMPHostType>;
+#endif
+
+}
+
+
+
+
+// The following chunk will compile but we're gonna try a wrapper approach like pair snap.
+/*
+#include "compute_sna_grid_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include "sna_kokkos.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+
+// ----------------------------------------------------------------------
+
+template<class DeviceType>
+ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeSNAGrid(lmp, narg, arg)
+{
+
+  printf("^^^ inside ComputeSNAGridKokkos constructor\n");
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+}
+
+// ----------------------------------------------------------------------
+
+template<class DeviceType>
+ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
+{
+  if (copymode) return;
+
+
+}
+
+namespace LAMMPS_NS {
+template class ComputeSNAGridKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeSNAGridKokkos<LMPHostType>;
+#endif
+}
+*/
+
--- a/src/KOKKOS/compute_sna_grid_kokkos.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos.h
@ -0,0 +1,366 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice<LMPDeviceType>);
+#ifdef LMP_KOKKOS_GPU
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost<LMPHostType>);
+#else
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
+#endif
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
+
+#include "compute_sna_grid.h"
+#include "kokkos_type.h"
+//#include "pair_snap.h"
+//#include "kokkos_type.h"
+//#include "neigh_list_kokkos.h"
+#include "sna_kokkos.h"
+//#include "pair_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Routines for both the CPU and GPU backend
+//template<int NEIGHFLAG, int EVFLAG>
+//struct TagPairSNAPComputeForce{};
+
+
+// GPU backend only
+/*
+struct TagPairSNAPComputeNeigh{};
+struct TagPairSNAPComputeCayleyKlein{};
+struct TagPairSNAPPreUi{};
+struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
+struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
+struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+struct TagPairSNAPComputeZi{};
+struct TagPairSNAPBeta{};
+struct TagPairSNAPComputeBi{};
+struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
+struct TagPairSNAPComputeYi{};
+struct TagPairSNAPComputeYiWithZlist{};
+template<int dir>
+struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
+template<int dir>
+struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
+*/
+//struct TagPairSNAPPreUi{};
+struct TagCSNAGridComputeNeigh{};
+struct TagCSNAGridComputeCayleyKlein{};
+struct TagCSNAGridPreUi{};
+struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
+struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
+struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
+struct TagCSNAGridComputeZi{};
+struct TagCSNAGridComputeBi{};
+struct TagCSNAGridTransformBi{}; // re-order blist from AoSoA to AoS
+struct TagCSNAGridLocalFill{}; // fill the gridlocal array
+//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
+
+struct TagComputeSNAGridLoop{};
+struct TagComputeSNAGrid3D{};
+//struct TagCSNAGridTeam{};
+
+// CPU backend only
+/*
+struct TagPairSNAPComputeNeighCPU{};
+struct TagPairSNAPPreUiCPU{};
+struct TagPairSNAPComputeUiCPU{};
+struct TagPairSNAPTransformUiCPU{};
+struct TagPairSNAPComputeZiCPU{};
+struct TagPairSNAPBetaCPU{};
+struct TagPairSNAPComputeBiCPU{};
+struct TagPairSNAPZeroYiCPU{};
+struct TagPairSNAPComputeYiCPU{};
+struct TagPairSNAPComputeDuidrjCPU{};
+struct TagPairSNAPComputeDeidrjCPU{};
+*/
+struct TagComputeSNAGridLoopCPU{};
+
+//template<class DeviceType>
+template<class DeviceType, typename real_type_, int vector_length_>
+class ComputeSNAGridKokkos : public ComputeSNAGrid {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  static constexpr int vector_length = vector_length_;
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+
+  // Static team/tile sizes for device offload
+
+#ifdef KOKKOS_ENABLE_HIP
+  static constexpr int team_size_compute_neigh = 2;
+  static constexpr int tile_size_compute_ck = 2;
+  static constexpr int tile_size_pre_ui = 2;
+  static constexpr int team_size_compute_ui = 2;
+  static constexpr int tile_size_transform_ui = 2;
+  static constexpr int tile_size_compute_zi = 2;
+  static constexpr int tile_size_compute_bi = 2;
+  static constexpr int tile_size_transform_bi = 2;
+  static constexpr int tile_size_compute_yi = 2;
+  static constexpr int team_size_compute_fused_deidrj = 2;
+#else
+  static constexpr int team_size_compute_neigh = 4;
+  static constexpr int tile_size_compute_ck = 4;
+  static constexpr int tile_size_pre_ui = 4;
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int tile_size_transform_ui = 4;
+  static constexpr int tile_size_compute_zi = 8;
+  static constexpr int tile_size_compute_bi = 4;
+  static constexpr int tile_size_transform_bi = 4;
+  static constexpr int tile_size_compute_yi = 8;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+#endif
+
+  // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
+  // This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
+  // and reduces the verbosity of the LaunchBound by hiding the explicit
+  // multiplication by vector_length
+  template <class Device, int num_tiles, class TagComputeSNAP>
+  using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagComputeSNAP>;
+
+  // MDRangePolicy for the 3D grid loop:
+  template <class Device, class TagComputeSNAP>
+  using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
+
+  // Testing out team policies
+  template <class Device, int num_teams,  class TagComputeSNAP>
+  using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
+  //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNAP>;
+  //using team_member = typename team_policy::member_type;
+
+  // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
+  // This hides the LaunchBounds abstraction by hiding the explicit
+  // multiplication by vector length
+  template <class Device, int num_teams, class TagComputeSNAP>
+  using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
+
+  ComputeSNAGridKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridKokkos() override;
+
+  void init() override;
+  void setup() override;
+  void compute_array() override;
+
+  // Utility functions for teams
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&);
+
+  // operator function for example team policy
+  //KOKKOS_INLINE_FUNCTION
+  //void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeam>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoop, const int& ) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagComputeSNAGridLoopCPU, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
+
+  // PrintNeigh
+  //void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPrintNeigh>::member_type& team) const;
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagCSNAGridLocalFill,const int& ii) const;
+
+ protected:
+
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
+
+  int max_neighs, chunk_size, chunk_offset;
+  int host_flag;
+  int ntotal;
+  int total_range; // total number of loop iterations in grid
+  int zlen; //= nzhi-nzlo+1;
+  int ylen; //= nyhi-nylo+1;
+  int xlen; //= nxhi-nxlo+1;
+
+  double cutsq_tmp; // temporary cutsq until we get a view
+
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
+  //Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_sinnerelem;           // element inner cutoff midpoint
+  Kokkos::View<real_type*, DeviceType> d_dinnerelem;           // element inner cutoff half-width
+  Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
+  Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
+  Kokkos::View<real_type*, DeviceType> d_test;              // test view
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<const F_FLOAT**, DeviceType,
+      Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
+  t_fparams_rnd rnd_cutsq;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_int_1d_randomread type;
+  DAT::tdual_float_2d k_grid;
+  DAT::tdual_float_2d k_gridall;
+  typename AT::t_float_2d d_grid;
+  typename AT::t_float_2d d_gridall;
+
+  //DAT::tdual_float_4d k_gridlocal;
+  //typedef Kokkos::DualView<real_type****, Kokkos::LayoutLeft, DeviceType> t_gridlocal_4d;
+  //typedef Kokkos::View<real_type****, DeviceType> t_4d;
+  // should we use LMPDeviceType below?
+  //typedef Kokkos::DualView<LMP_FLOAT****, LMPDeviceType> tdual_float_4d;
+  //typedef tdual_float_4d::t_dev tdev_float_4d;
+  //tdual_float_4d k_gridlocal;
+  //tdev_float_4d d_gridlocal; 
+  DAT::tdual_float_4d k_gridlocal;
+  typename AT::t_float_4d d_gridlocal;
+
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  class DomainKokkos *domainKK;
+
+  // triclinic vars
+  /*
+  xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+  xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+  xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+  */
+  double h0, h1, h2, h3, h4, h5;
+  double lo0, lo1, lo2;
+
+};
+
+// These wrapper classes exist to make the compute style factory happy/avoid having
+// to extend the compute  style factory to support Compute classes w/an arbitrary number
+// of extra template parameters
+
+template <class DeviceType>
+class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
+
+  void init() override;
+  void compute_array() override;
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+template <class DeviceType>
+class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+
+ private:
+  using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+ public:
+
+  ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
+
+  void init() override;
+  void compute_array() override;
+
+};
+#endif
+
+}
+
+#endif
+#endif
+
+// The following will compile with the chunk in cpp file but we're gonna try wrapper like pair snap.
+/*
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
+ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
+#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
+
+#include "compute_sna_grid.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+//template<int CSTYLE, int NCOL>
+//struct TagComputeCoordAtom{};
+
+template<class DeviceType>
+class ComputeSNAGridKokkos : public ComputeSNAGrid {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  ComputeSNAGridKokkos(class LAMMPS *, int, char **);
+  ~ComputeSNAGridKokkos() override;
+
+ private:
+
+};
+
+}
+
+#endif
+#endif
+*/
+
--- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h
+++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h
@ -0,0 +1,932 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
+                         Evan Weinberg (NVIDIA)
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_kokkos.h"
+#include "pair_snap_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+//#include "sna_kokkos.h"
+#include "domain.h"
+#include "domain_kokkos.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+namespace LAMMPS_NS {
+
+// Constructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
+  auto d_cutsq = k_cutsq.template view<DeviceType>();
+  rnd_cutsq = d_cutsq;
+
+  host_flag = (execution_space == Host);
+
+  // TODO: Extract cutsq in double loop below, no need for cutsq_tmp
+
+  cutsq_tmp = cutsq[1][1];
+
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = 1; j <= atom->ntypes; j++){
+      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
+      k_cutsq.template modify<LMPHostType>();
+    }
+  }
+
+   // Set up element lists
+  MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
+  MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
+  MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
+  MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
+  // test
+  MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
+
+  int n = atom->ntypes;
+  MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
+
+  auto h_radelem = Kokkos::create_mirror_view(d_radelem);
+  auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
+  auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
+  auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  // test
+  auto h_test = Kokkos::create_mirror_view(d_test);
+  h_test(0) = 2.0;
+
+  // start from index 1 because of how compute sna/grid is
+  for (int i = 1; i <= atom->ntypes; i++) {
+    h_radelem(i-1) = radelem[i];
+    h_wjelem(i-1) = wjelem[i];
+    if (switchinnerflag){
+      h_sinnerelem(i) = sinnerelem[i];
+      h_dinnerelem(i) = dinnerelem[i];
+    }
+  }
+
+  // In pair snap some things like `map` get allocated regardless of chem flag.
+  if (chemflag){ 
+    for (int i = 1; i <= atom->ntypes; i++) {
+      h_map(i) = map[i];
+    }
+  }
+
+  Kokkos::deep_copy(d_radelem,h_radelem);
+  Kokkos::deep_copy(d_wjelem,h_wjelem);
+  if (switchinnerflag){
+    Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
+    Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
+  }
+  if (chemflag){
+    Kokkos::deep_copy(d_map,h_map);
+  }
+  Kokkos::deep_copy(d_test,h_test);
+
+  double bytes =  MemKK::memory_usage(d_wjelem);
+
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
+    rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
+  snaKK.grow_rij(0,0);
+  snaKK.init();
+
+}
+
+// Destructor
+
+template<class DeviceType, typename real_type, int vector_length>
+ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  //memoryKK->destroy_kokkos(k_grid,grid);
+  memoryKK->destroy_kokkos(k_gridall, gridall);
+  //memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
+}
+
+// Init
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
+{
+  if (host_flag) {
+    return;
+  }
+  ComputeSNAGrid::init();
+
+}
+
+// Setup
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
+{
+
+  // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
+  // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
+
+  ComputeGrid::set_grid_global();
+  ComputeGrid::set_grid_local();
+  
+  // allocate arrays
+  //printf(">>> Allocating gridall.\n");
+  //printf(">>> %d %d\n", size_array_rows, size_array_cols);
+  //memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
+  memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
+  //printf(">>> Allocated gridall.\n");
+
+  // do not use or allocate gridlocal for now
+
+  gridlocal_allocated = 0;
+  /*
+  if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
+    gridlocal_allocated = 1;
+    memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo, 
+                                     nyhi, nxlo, nxhi, "grid:gridlocal");
+  }
+  */
+  array = gridall;
+
+  d_gridlocal = k_gridlocal.template view<DeviceType>();
+  //d_grid = k_grid.template view<DeviceType>();
+  d_gridall = k_gridall.template view<DeviceType>();
+}
+
+// Compute
+
+template<class DeviceType, typename real_type, int vector_length>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
+{
+  if (host_flag) {
+    return;
+  }
+
+  copymode = 1;
+
+  zlen = nzhi-nzlo+1;
+  ylen = nyhi-nylo+1;
+  xlen = nxhi-nxlo+1;
+  total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  // max_neighs is defined here - think of more elaborate methods.
+  max_neighs = 100;
+
+  // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total 
+  // number of atoms.
+
+  ntotal = atomKK->nlocal + atomKK->nghost;
+  // Allocate view for number of neighbors per grid point
+  MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
+
+  // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
+  // `total_range` is the number of grid points which may be larger than chunk size.
+  //printf(">>> total_range: %d\n", total_range);
+  chunk_size = MIN(chunksize, total_range);
+  chunk_offset = 0;
+  //snaKK.grow_rij(chunk_size, ntotal);
+  snaKK.grow_rij(chunk_size, max_neighs);
+
+  //chunk_size = total_range;
+ 
+  // Pre-compute ceil(chunk_size / vector_length) for code cleanliness
+  const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+  if (triclinic){
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    h0 = domain->h[0];
+    h1 = domain->h[1];
+    h2 = domain->h[2];
+    h3 = domain->h[3];
+    h4 = domain->h[4];   
+    h5 = domain->h[5];   
+    lo0 = domain->boxlo[0];
+    lo1 = domain->boxlo[1];
+    lo2 = domain->boxlo[2];
+  }
+
+  while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
+
+    if (chunk_size > total_range - chunk_offset)
+      chunk_size = total_range - chunk_offset;
+
+    //printf(">>> chunk_offset: %d\n", chunk_offset);
+
+    //ComputeNeigh 
+    {
+      int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
+
+      SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh> 
+        policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeCayleyKlein
+    {
+      // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
+        policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
+      Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
+    }
+
+    //PreUi
+    {
+      // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
+        policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
+      Kokkos::parallel_for("PreUi",policy_preui,*this);
+    }
+
+    // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
+    {
+      // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
+      // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+      const int tile_size = vector_length * (twojmax + 1);
+      const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
+
+      if (chunk_size < parallel_thresh)
+      {
+        // Version with parallelism over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
+        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
+      } else {
+        // Version w/out parallelism  over j_bend
+
+        // total number of teams needed: (natoms / 32) * (ntotal)
+        const int n_teams = chunk_size_div * max_neighs;
+        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
+          policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
+      }
+    }
+
+    //TransformUi: un-"fold" ulisttot, zero ylist
+    {
+      // team_size_transform_ui is defined in `pair_snap_kokkos.h`
+      Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
+          policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
+      Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
+    }
+
+    //Compute bispectrum in AoSoA data layout, transform Bi
+
+    //ComputeZi
+    const int idxz_max = snaKK.idxz_max;
+    Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
+        policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
+    Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
+
+    //ComputeBi
+    const int idxb_max = snaKK.idxb_max;
+    Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
+        policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
+    Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
+
+    //Transform data layout of blist out of AoSoA
+    //We need this because `blist` gets used in ComputeForce which doesn't
+    //take advantage of AoSoA, which at best would only be beneficial on the margins
+    //NOTE: Do we need this in compute sna/grid/kk?
+    Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridTransformBi>
+        policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
+    Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+
+    // Fill the grid array with bispectrum values
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
+      Kokkos::parallel_for(policy_fill, *this);
+    }
+
+    // Proceed to the next chunk.
+    chunk_offset += chunk_size;
+
+  } // end while
+
+  k_gridlocal.template modify<DeviceType>();
+  k_gridlocal.template sync<LMPHostType>();
+
+  //k_grid.template modify<DeviceType>();
+  //k_grid.template sync<LMPHostType>();
+
+  k_gridall.template modify<DeviceType>();
+  k_gridall.template sync<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+/*
+ Simple team policy functor seeing how many layers deep we can go with the parallelism.
+ */
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
+
+  // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
+  // Main difference is that we don't use the neighbor class or neighbor variables here.
+  // This is because the grid points are not atoms and therefore do not get assigned
+  // neighbors in LAMMPS. 
+  // TODO: If we did make a neighborlist for each grid point, we could use current 
+  //       routines and avoid having to loop over all atoms (which limits us to 
+  //       natoms = max team size).
+
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // basic quantities associated with this team:
+  // team_rank : rank of thread in this team
+  // league_rank : rank of team in this league
+  // team_size : number of threads in this team
+
+  // extract loop index
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+
+  if (ii >= chunk_size) return;
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  const int tile_size = ntotal; //max_neighs; // number of elements per thread
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  //int igrid = iz * (nx * ny) + iy * nx + ix;
+  //printf("%d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  // multiply grid integers by grid spacing delx, dely, delz
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+
+  // currently, all grid points are type 1
+  // not clear what a better choice would be
+
+  const int itype = 1;
+  int ielem = 0;
+  if (chemflag) ielem = d_map[itype];
+  const double radi = d_radelem[ielem];
+
+  // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
+  // The purpose here is to transform for triclinic boxes.
+  /*
+  if (triclinic){
+    printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
+  }
+  */
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+  
+  // Looping over ntotal for now.
+  for (int j = 0; j < ntotal; j++){
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // don't include atoms that share location with grid point
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    } 
+
+    if (jtype >= 0)
+      ninside++;
+
+  } 
+
+  /*
+  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
+    [&] (const int j, int& count) {
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    // don't include atoms that share location with grid point
+    if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    } 
+
+    type_cache[j] = jtype;
+
+    if (jtype >= 0)
+     count++;
+
+  }, ninside);
+  */
+
+  d_ninside(ii) = ninside; 
+
+  // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    //const int jtype = type_cache[j];
+    //if (jtype >= 0) {
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+    int jtype = type(j);
+    if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      my_sna.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        my_sna.element(ii,offset) = jelem;
+      else
+        my_sna.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+
+  /*
+  int offset = 0;
+  for (int j = 0; j < ntotal; j++){
+    const int jtype = type_cache[j];
+    if (jtype >= 0) {
+      printf(">>> offset: %d\n", offset);
+      const F_FLOAT dx = x(j,0) - xtmp;
+      const F_FLOAT dy = x(j,1) - ytmp;
+      const F_FLOAT dz = x(j,2) - ztmp;
+      int jtype = type(j);
+      int jelem = 0;
+      if (chemflag) jelem = d_map[jtype];
+      my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+      my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+      my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+      // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+      // actually since the views here have values starting at 0, let's use jelem
+      my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+      my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+      my_sna.inside(ii,offset) = j;
+      if (switchinnerflag) {
+        my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+        my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+      }
+      if (chemflag)
+        my_sna.element(ii,offset) = jelem;
+      else
+        my_sna.element(ii,offset) = 0;
+      offset++;
+    }
+  }
+  */
+
+  /*
+  Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
+    [&] (const int j, int& offset, bool final) {
+
+    const int jtype = type_cache[j];
+
+    if (jtype >= 0) {
+      if (final) {
+        const F_FLOAT dx = x(j,0) - xtmp;
+        const F_FLOAT dy = x(j,1) - ytmp;
+        const F_FLOAT dz = x(j,2) - ztmp;
+        int jtype = type(j);
+        int jelem = 0;
+        if (chemflag) jelem = d_map[jtype];
+        my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+        my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+        my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+        // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
+        // actually since the views here have values starting at 0, let's use jelem
+        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
+        my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
+        my_sna.inside(ii,offset) = j;
+        if (switchinnerflag) {
+          my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
+          my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
+        }
+        if (chemflag)
+          my_sna.element(ii,offset) = jelem;
+        else
+          my_sna.element(ii,offset) = 0;
+      }
+      offset++;
+    }
+  });
+  */
+}
+
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int ii = iatom_mod + iatom_div * vector_length;
+  if (ii >= chunk_size) return;
+
+  const int ninside = d_ninside(ii);
+  if (jnbor >= ninside) return;
+
+  my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int ii = iatom_mod + iatom_div * vector_length;
+  if (ii >= chunk_size) return;
+
+  int itype = type(ii);
+  // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp`
+  int ielem = 0;
+
+  my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // extract flattened atom_div / neighbor number / bend_location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (idxu > my_sna.idxu_max) return;
+
+  int elem_count = chemflag ? nelements : 1;
+
+  for (int ielem = 0; ielem < elem_count; ielem++){
+
+    const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
+
+    auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+
+    if (mapper.flip_sign == 1){
+      utot_im = -utot_im;
+    } else if (mapper.flip_sign == -1){
+      utot_re = -utot_re;
+    }
+
+    my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
+
+    if (mapper.flip_sign == 0) {
+      my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+      my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+    }
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (jjz >= my_sna.idxz_max) return;
+
+  my_sna.compute_zi(iatom_mod,jjz,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (jjb >= my_sna.idxb_max) return;
+
+  my_sna.compute_bi(iatom_mod,jjb,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (idxb >= my_sna.idxb_max) return;
+
+  const int ntriples = my_sna.ntriples;
+
+  for (int itriple = 0; itriple < ntriples; itriple++) {
+
+    const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
+
+    my_sna.blist(iatom, itriple, idxb) = blocal;
+  }
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+
+  // extract grid index
+  int igrid = ii + chunk_offset;
+
+  // convert to grid indices
+
+  int iz = igrid/(xlen*ylen);
+  int i2 = igrid - (iz*xlen*ylen);
+  int iy = i2/xlen;
+  int ix = i2 % xlen;
+  iz += nzlo;
+  iy += nylo;
+  ix += nxlo;
+
+  double xgrid[3];
+
+  // index ii already captures the proper grid point
+  // int igrid = iz * (nx * ny) + iy * nx + ix;
+  // printf("ii igrid: %d %d\n", ii, igrid);
+
+  // grid2x converts igrid to ix,iy,iz like we've done before
+  //grid2x(igrid, xgrid);
+  xgrid[0] = ix * delx;
+  xgrid[1] = iy * dely;
+  xgrid[2] = iz * delz;
+  if (triclinic) {
+
+    // Do a conversion on `xgrid` here like we do in the CPU version.
+
+    // Can't do this:
+    // domainKK->lamda2x(xgrid, xgrid);
+    // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
+
+    // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
+    /*
+    xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
+    xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
+    xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
+    */
+    xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
+    xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
+    xgrid[2] = h2*xgrid[2] + lo2;
+  }
+
+  const F_FLOAT xtmp = xgrid[0];
+  const F_FLOAT ytmp = xgrid[1];
+  const F_FLOAT ztmp = xgrid[2];
+  d_gridall(igrid,0) = xtmp;
+  d_gridall(igrid,1) = ytmp;
+  d_gridall(igrid,2) = ztmp;
+
+  const auto idxb_max = snaKK.idxb_max;
+
+  // linear contributions
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    const auto idxb = icoeff % idxb_max;
+    const auto idx_chem = icoeff / idxb_max;
+    d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   utility functions
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<class TagStyle>
+void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<typename scratch_type>
+int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
+  typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
+
+  return ScratchViewType::shmem_size(values_per_team);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridKokkosDevice<DeviceType>::init()
+{
+  Base::init();
+}
+
+template<class DeviceType>
+void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
+{
+  Base::compute_array();
+}
+
+#ifdef LMP_KOKKOS_GPU
+template<class DeviceType>
+ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
+   : ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
+
+template<class DeviceType>
+void ComputeSNAGridKokkosHost<DeviceType>::init()
+{
+  Base::init();
+}
+
+template<class DeviceType>
+void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
+{
+  Base::compute_array();
+}
+#endif
+
+}
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@ -769,6 +769,14 @@ typedef tdual_float_3d::t_dev_um t_float_3d_um;
 typedef tdual_float_3d::t_dev_const_um t_float_3d_const_um;
 typedef tdual_float_3d::t_dev_const_randomread t_float_3d_randomread;

+//4d float array n
+typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
+typedef tdual_float_4d::t_dev t_float_4d;
+typedef tdual_float_4d::t_dev_const t_float_4d_const;
+typedef tdual_float_4d::t_dev_um t_float_4d_um;
+typedef tdual_float_4d::t_dev_const_um t_float_4d_const_um;
+typedef tdual_float_4d::t_dev_const_randomread t_float_4d_randomread;
+
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else
@ -1075,6 +1083,14 @@ typedef tdual_float_2d::t_host_um t_float_2d_um;
 typedef tdual_float_2d::t_host_const_um t_float_2d_const_um;
 typedef tdual_float_2d::t_host_const_randomread t_float_2d_randomread;

+//4d float array n
+typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
+typedef tdual_float_4d::t_host t_float_4d;
+typedef tdual_float_4d::t_host_const t_float_4d_const;
+typedef tdual_float_4d::t_host_um t_float_4d_um;
+typedef tdual_float_4d::t_host_const_um t_float_4d_const_um;
+typedef tdual_float_4d::t_host_const_randomread t_float_4d_randomread;
+
 #ifdef LMP_KOKKOS_NO_LEGACY
 typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
 #else
--- a/src/KOKKOS/memory_kokkos.h
+++ b/src/KOKKOS/memory_kokkos.h
@ -163,6 +163,7 @@ template <typename TYPE, typename HTYPE>
 {
  data = TYPE(std::string(name),n1,n2);
  h_data = Kokkos::create_mirror_view(data);
+  //printf(">>> name: %s\n", name);
  return data;
 }

@ -173,6 +174,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
  data = TYPE(std::string(name),n1,n2);
  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
  array = (typename TYPE::value_type **) smalloc(nbytes,name);
+  //printf(">>> name %s nbytes %d\n", name, nbytes);

  for (int i = 0; i < n1; i++) {
    if (n2 == 0)
@ -183,6 +185,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
  return data;
 }

+/* ----------------------------------------------------------------------
+   create a 4d array with indices 2,3,4 offset, but not first
+   2nd index from n2lo to n2hi inclusive
+   3rd index from n3lo to n3hi inclusive
+   4th index from n4lo to n4hi inclusive
+   cannot grow it
+------------------------------------------------------------------------- */
+
+template <typename TYPE>
+TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array,
+                             int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi,
+                             const char *name)
+{
+  //if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array =  nullptr;
+
+  printf("^^^^^ memoryKK->create_4d_offset_kokkos\n");
+
+  int n2 = n2hi - n2lo + 1;
+  int n3 = n3hi - n3lo + 1;
+  int n4 = n4hi - n4lo + 1;
+  data = TYPE(std::string(name),n1,n2,n3,n4);
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1;
+  array = (typename TYPE::value_type ****) smalloc(nbytes,name);
+
+  for (int i = 0; i < n1; i++) {
+    if (n2 == 0) {
+      array[i] = nullptr;
+    } else {
+      nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2;
+      array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name);
+      for (int j = 0; j < n2; j++){
+        if (n3 == 0){
+          array[i][j] = nullptr;
+        } else {
+          nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3;
+          array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name);
+          for (int k = 0; k < n3; k++){
+            if (n4 == 0)
+              array[i][j][k] = nullptr;
+            else
+              array[i][j][k] = &data.h_view(i,j,k,0);
+          }
+        }
+      }
+    }
+  }
+
+  return data;
+}
+
 template <typename TYPE, typename HTYPE>
  TYPE create_kokkos(TYPE &data, HTYPE &h_data,
                     typename TYPE::value_type **&array, int n1, int n2,
--- a/src/KOKKOS/pair_mliap_kokkos.cpp
+++ b/src/KOKKOS/pair_mliap_kokkos.cpp
@ -232,6 +232,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
  // map[i] = which element the Ith atom type is, -1 if not mapped
  // map[0] is not used

+  //printf(">>> ntypes: %d\n", atom->ntypes);
  for (int i = 1; i <= atom->ntypes; i++) {
    char* elemname = elemtypes[i-1];
    int jelem;
@ -239,6 +240,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
      if (strcmp(elemname,descriptor->elements[jelem]) == 0)
        break;

+    //printf(">>> nelements: %d\n", descriptor->nelements);
    if (jelem < descriptor->nelements)
      map[i] = jelem;
    else if (strcmp(elemname,"NULL") == 0) map[i] = -1;
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -664,6 +664,8 @@ template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh>::member_type& team) const {

+
+  printf("d_wjelem: %f %f %f %f\n", d_wjelem[0], d_wjelem[1], d_wjelem(0), d_wjelem(1));
  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;

  // extract atom number
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@ -643,7 +643,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
    }

    ulist_wrapper.set(ma, ulist_accum);
-
    mb++;
  }

@ -870,7 +869,6 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
  int jju1 = idxu_block[j1] + (j1+1)*mb1min;
  int jju2 = idxu_block[j2] + (j2+1)*mb2max;
  int icgb = mb1min*(j2+1) + mb2max;
-
  #ifdef LMP_KK_DEVICE_COMPILE
  #pragma unroll
  #endif
@ -2298,7 +2296,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_s_dsfac(const real
  constexpr real_type zero = static_cast<real_type>(0.0);
  constexpr real_type onehalf = static_cast<real_type>(0.5);

-  if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; }
+  if (switch_flag == 0) { sfac_outer = one; dsfac_outer = zero; }
  else if (switch_flag == 1) {
    if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; }
    else if (r > rcut) { sfac = zero; dsfac = zero; return; }
--- a/src/ML-SNAP/compute_grid.cpp
+++ b/src/ML-SNAP/compute_grid.cpp
@ -57,6 +57,7 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :

 ComputeGrid::~ComputeGrid()
 {
+  if (copymode) return;
  deallocate();
 }

@ -87,6 +88,7 @@ void ComputeGrid::grid2x(int igrid, double *x)
  x[2] = iz * delz;

  if (triclinic) domain->lamda2x(x, x);
+  //printf(">>>>> ComputeGrid::grid2x\n");
 }

 /* ----------------------------------------------------------------------
@ -102,6 +104,7 @@ void ComputeGrid::assign_coords_all()
    gridall[igrid][1] = x[1];
    gridall[igrid][2] = x[2];
  }
+  //printf(">>>>> ComputeGrid::assign_coords_all\n");
 }

 /* ----------------------------------------------------------------------
@ -110,8 +113,8 @@ void ComputeGrid::assign_coords_all()

 void ComputeGrid::allocate()
 {
+  //printf(">>> ComputeGrid::allocate\n");
  // allocate arrays
-
  memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
  memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
  if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
--- a/src/ML-SNAP/compute_sna_grid.cpp
+++ b/src/ML-SNAP/compute_sna_grid.cpp
@ -31,14 +31,13 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  // skip over arguments used by base class
  // so that argument positions are identical to
  // regular per-atom compute
-
  arg += nargbase;
  narg -= nargbase;

  // begin code common to all SNAP computes

-  double rfac0, rmin0;
-  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  //double rfac0, rmin0;
+  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;

  int ntypes = atom->ntypes;
  int nargmin = 6 + 2 * ntypes;
@ -56,6 +55,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  wselfallflag = 0;
  switchinnerflag = 0;
  nelements = 1;
+  chunksize = 32768;
+  parallel_thresh = 8192;

  // process required arguments

@ -67,8 +68,9 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);

  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
-  for (int i = 0; i < ntypes; i++)
+  for (int i = 0; i < ntypes; i++) {
    wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
+  }

  // construct cutsq

@ -181,11 +183,12 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :

 ComputeSNAGrid::~ComputeSNAGrid()
 {
+  if (copymode) return;
+
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(cutsq);
  delete snaptr;
-
  if (chemflag) memory->destroy(map);
 }

@ -202,6 +205,7 @@ void ComputeSNAGrid::init()

 void ComputeSNAGrid::compute_array()
 {
+
  invoked_array = update->ntimestep;

  // compute sna for each gridpoint
--- a/src/ML-SNAP/compute_sna_grid.h
+++ b/src/ML-SNAP/compute_sna_grid.h
@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid {
  void init() override;
  void compute_array() override;
  double memory_usage() override;
+  int ncoeff,nelements; // public for kokkos, but could go in the protected block now

- private:
-  int ncoeff;
+ protected:
+  //int ncoeff;
  double **cutsq;
  double rcutfac;
  double *radelem;
  double *wjelem;
  int *map;    // map types to [0,nelements)
-  int nelements, chemflag;
+  int chemflag;
  int switchinnerflag;
  double *sinnerelem;
  double *dinnerelem;
+  int parallel_thresh;
  class SNA *snaptr;
  double cutmax;
  int quadraticflag;
+  double rfac0, rmin0;
+  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+  int chunksize;
+
 };

 }    // namespace LAMMPS_NS