From befd4c8bfdd309029f81fd517d7df3181098e991 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 19 Jun 2020 14:12:29 -0400 Subject: [PATCH 1/6] Optimizations to Compute[Yi/Zi/Bi], switching over to an AoSoA data layout on the GPU. CPU vs GPU code paths are now maximally divergent, will include some discussion of that in PR. Small performance tweaks in Compute[UiTot/FusedDeidrj]. --- src/KOKKOS/pair_snap_kokkos.h | 63 +- src/KOKKOS/pair_snap_kokkos_impl.h | 522 +++++++++++---- src/KOKKOS/sna_kokkos.h | 64 +- src/KOKKOS/sna_kokkos_impl.h | 978 +++++++++++++++++++---------- 4 files changed, 1144 insertions(+), 483 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index d6b028640d..a4e15fe5ce 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -30,19 +30,31 @@ PairStyle(snap/kk/host,PairSNAPKokkos) namespace LAMMPS_NS { +// Routines for both the CPU and GPU backend template struct TagPairSNAPComputeForce{}; - -struct TagPairSNAPBeta{}; struct TagPairSNAPComputeNeigh{}; + +// GPU backend only struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUi{}; -struct TagPairSNAPComputeUiCPU{}; +struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; +struct TagPairSNAPBeta{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPZeroYi{}; +struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS struct TagPairSNAPComputeYi{}; +struct TagPairSNAPTransformYi{}; // re-order ylist from AoSoA to AoS struct TagPairSNAPComputeFusedDeidrj{}; + +// CPU backend only +struct TagPairSNAPPreUiCPU{}; +struct TagPairSNAPComputeUiCPU{}; +struct TagPairSNAPComputeZiCPU{}; +struct TagPairSNAPBetaCPU{}; +struct TagPairSNAPComputeBiCPU{}; +struct TagPairSNAPZeroYiCPU{}; +struct TagPairSNAPComputeYiCPU{}; struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDeidrjCPU{}; @@ -81,6 +93,10 @@ public: KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPBetaCPU,const int& ii) const; + + // GPU backend only KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -88,32 +104,54 @@ public: void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi,const int& ii) const; + void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPBeta, const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi,const int& ii) const; + void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, 
const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const; + // CPU backend only + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeZiCPU,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiCPU,const int& ii) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPBeta,const int& ii) const; - template KOKKOS_INLINE_FUNCTION void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, @@ -175,6 +213,7 @@ inline double dist2(double* x,double* y); Kokkos::View d_map; // mapping from atom types to elements Kokkos::View d_ninside; // ninside for all atoms in list Kokkos::View d_beta; // betas for all atoms in list + Kokkos::View d_beta_pack; // betas for all atoms in list, GPU Kokkos::View d_bispectrum; // bispectrum components for all atoms in list typedef Kokkos::DualView tdual_fparams; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 5dbf8e3a5a..64831e91cf 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -194,6 +194,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) if (beta_max < inum) { beta_max = inum; d_beta = Kokkos::View("PairSNAPKokkos:beta",ncoeff,inum); + if (!host_flag) + d_beta_pack = Kokkos::View("PairSNAPKokkos:beta_pack",32,ncoeff,(inum+32-1)/32); d_ninside = Kokkos::View("PairSNAPKokkos:ninside",inum); } @@ -222,32 +224,95 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); } - //PreUi + if (host_flag) { - int vector_length = vector_length_default; - int team_size = team_size_default; - if (!host_flag) - vector_length = 32; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length); - Kokkos::parallel_for("PreUi",policy_preui,*this); - } + // Host codepath - // ComputeUI - { - int vector_length = vector_length_default; - int team_size = team_size_default; - if (host_flag) { // CPU - // Run a fused calculation of ulist and accumulation into ulisttot using atomics + //PreUi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); + Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); + } + // ComputeUi + { + int vector_length = vector_length_default; + int team_size = 
team_size_default; + // Fused calculation of ulist and accumulation into ulisttot using atomics typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); + } - } else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics + //Compute bispectrum + if (quadraticflag || eflag) { + //ComputeZi + int idxz_max = snaKK.idxz_max; + typename Kokkos::RangePolicy policy_zi_cpu(0,chunk_size*idxz_max); + Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); - vector_length = 32; - team_size = 4; // need to cap b/c of shared memory reqs + //ComputeBi + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_bi_cpu(chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); + } + + + //ZeroYi,ComputeYi + { + + int vector_length = vector_length_default; + int team_size = team_size_default; + + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this); + + //ZeroYi + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); + Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this); + + //ComputeYi + int idxz_max = snaKK.idxz_max; + typename Kokkos::RangePolicy policy_yi_cpu(0,chunk_size*idxz_max); + Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this); + } // host flag + + //ComputeDuidrj and Deidrj + { + int team_size = team_size_default; + int vector_length = vector_length_default; + + typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + snaKK.set_dir(-1); // technically doesn't do anything + Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); + + typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); + } + } else { // GPU + +#ifdef KOKKOS_ENABLE_CUDA + //PreUi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length); + Kokkos::parallel_for("PreUi",policy_preui,*this); + } + + // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot + { + + int vector_length = 32; + int team_size = 4; // need to cap b/c of shared memory reqs check_team_size_for(chunk_size,team_size,vector_length); // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values @@ -265,62 +330,54 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeUi",policy_ui,*this); + //Transform data layout of ulisttot to AoSoA, zero ylist + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + } - } + //Compute bispectrum in AoSoA data layout, transform Bi + if (quadraticflag || eflag) { + //ComputeZi + int idxz_max = snaKK.idxz_max; + 
typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); - //Compute bispectrum - if (quadraticflag || eflag) { - //ComputeZi - int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_zi(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeZi",policy_zi,*this); + //ComputeBi + int idxb_max = snaKK.idxb_max; + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); - //ComputeBi - int vector_length = vector_length_default; - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_bi(chunk_size,team_size,vector_length); - Kokkos::parallel_for("ComputeBi",policy_bi,*this); - } + //Transform data layout of blist out of AoSoA + //We need this b/c `blist` gets used in ComputeForce which doesn't + //take advantage of AoSoA (which at best would only be beneficial + //on the margins) + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + } - //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); - Kokkos::parallel_for("ComputeBeta",policy_beta,*this); + //ComputeYi in AoSoA data layout, transform to AoS for ComputeFusedDeidrj + //Note zeroing `ylist` is fused into `TransformUi`. 
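      // (Illustrative aside, restating the index math that the Rank-3 MDRange
      //  launches above and below assume; the shorthand names here are not code
      //  from this patch. Each atom in the chunk is addressed by the pair
      //    iatom_mod = iatom % 32;   // fast, unit-stride lane within a 32-wide pack
      //    iatom_div = iatom / 32;   // slow index selecting the pack
      //  so iatom = iatom_mod + 32 * iatom_div, and the packed AoSoA views are
      //  indexed as, e.g., ulisttot_pack(iatom_mod, idxu, ielem, iatom_div).
      //  The (chunk_size + 32 - 1) / 32 launch bound is the number of packs; the
      //  `if (iatom >= chunk_size) return;` guard in each operator() discards the
      //  padding lanes of the final pack.)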
+ { + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBeta",policy_beta,*this); - //ZeroYi - { - int vector_length = vector_length_default; - int team_size = team_size_default; - if (!host_flag) - team_size = 128; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); - Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this); - } + //ComputeYi + const int idxz_max = snaKK.idxz_max; + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); - //ComputeYi - int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_yi(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeYi",policy_yi,*this); + //Transform data layout of ylist out of AoSoA + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("TransformYi",policy_transform_yi,*this); - //ComputeDuidrj and Deidrj - { - int team_size = team_size_default; - int vector_length = vector_length_default; - if (host_flag) { // CPU + } - typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - snaKK.set_dir(-1); // technically doesn't do anything - Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - - typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - - Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); - } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei - - vector_length = 32; - team_size = 2; // need to cap b/c of shared memory reqs + // Fused ComputeDuidrj, ComputeDeidrj + { + int vector_length = 32; + int team_size = 2; // need to cap b/c of shared memory reqs check_team_size_for(chunk_size,team_size,vector_length); // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry @@ -341,6 +398,9 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this); } } + +#endif // KOKKOS_ENABLE_CUDA + } //ComputeForce @@ -416,38 +476,6 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) } } -/* ---------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { - - const int i = d_ilist[ii + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - SNAKokkos my_sna = snaKK; - - Kokkos::View> - d_coeffi(d_coeffelem,ielem,Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) - d_beta(icoeff,ii) = d_coeffi[icoeff+1]; - - if (quadraticflag) { - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - double bveci = my_sna.blist(icoeff,ii); - d_beta(icoeff,ii) += d_coeffi[k]*bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - double bvecj = my_sna.blist(jcoeff,ii); - d_beta(icoeff,ii) += d_coeffi[k]*bvecj; - d_beta(jcoeff,ii) += d_coeffi[k]*bveci; - k++; - } - } - } -} /* 
---------------------------------------------------------------------- allocate all arrays @@ -520,6 +548,10 @@ void PairSNAPKokkos::coeff(int narg, char **arg) snaKK.init(); } +/* ---------------------------------------------------------------------- + Begin routines that are called on both CPU and GPU codepaths +------------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- */ template @@ -594,6 +626,53 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typen }); } +/* ---------------------------------------------------------------------- + Begin routines that are unique to the GPU codepath. These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { + + if (ii >= chunk_size) return; + + const int iatom_mod = ii % 32; + const int iatom_div = ii / 32; + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + SNAKokkos my_sna = snaKK; + + Kokkos::View> + d_coeffi(d_coeffelem,ielem,Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1]; + } + + if (quadraticflag) { + const auto idxb_max = my_sna.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + double bveci = my_sna.blist(idxb, idx_chem, ii); + d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + double bvecj = my_sna.blist(jdxb, jdx_chem, ii); + d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj; + d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci; + k++; + } + } + } +} + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -627,61 +706,107 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { SNAKokkos my_sna = snaKK; - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; + if (idxu >= my_sna.idxu_max) return; + + int elem_count = chemflag ? 
nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++) { + + const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom); + const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom); + + my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + + my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.; + my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.; + } - my_sna.compute_ui_cpu(team,ii,jj); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { SNAKokkos my_sna = snaKK; - // Extract the quantum number - const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); - if (idx >= my_sna.idxu_max) return; + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; - // Extract the atomic index - const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); - if (ii >= chunk_size) return; + if (jjz >= my_sna.idxz_max) return; - if (chemflag) - for(int ielem = 0; ielem < nelements; ielem++) - my_sna.zero_yi(idx,ii,ielem); - else - my_sna.zero_yi(idx,ii,0); + my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const { SNAKokkos my_sna = snaKK; - my_sna.compute_yi(ii,d_beta); + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (idxu >= my_sna.idxu_max) return; + + int elem_count = chemflag ? 
nelements : 1; + for (int ielem = 0; ielem < elem_count; ielem++) { + const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div); + const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div); + + my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im }; + } + } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { SNAKokkos my_sna = snaKK; - my_sna.compute_zi(ii); + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (jjz >= my_sna.idxz_max) return; + + my_sna.compute_zi(iatom_mod,jjz,iatom_div); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { SNAKokkos my_sna = snaKK; - my_sna.compute_bi(team,ii); + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (jjb >= my_sna.idxb_max) return; + + my_sna.compute_bi(iatom_mod,jjb,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (idxb >= my_sna.idxb_max) return; + + const int ntriples = my_sna.ntriples; + + for (int itriple = 0; itriple < ntriples; itriple++) { + + const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); + + my_sna.blist(idxb, itriple, iatom) = blocal; + } + } template @@ -701,6 +826,126 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrj,const my_sna.compute_fused_deidrj(team,ii,jj); } +/* ---------------------------------------------------------------------- + Begin routines that are unique to the CPU codepath. These do not take + advantage of AoSoA data layouts, but that could be a good point of + future optimization and unification with the above kernels. It's unlikely + that scratch memory optimizations will ever be useful for the CPU due to + different arithmetic intensity requirements for the CPU vs GPU. 
+------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPBetaCPU,const int& ii) const { + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + SNAKokkos my_sna = snaKK; + + Kokkos::View> + d_coeffi(d_coeffelem,ielem,Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + d_beta(icoeff,ii) = d_coeffi[icoeff+1]; + + if (quadraticflag) { + const auto idxb_max = my_sna.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + double bveci = my_sna.blist(idxb,idx_chem,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + double bvecj = my_sna.blist(jdxb,jdx_chem,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bvecj; + d_beta(jcoeff,ii) += d_coeffi[k]*bveci; + k++; + } + } + } +} + + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the atom number + const int ii = team.team_rank() + team.team_size() * team.league_rank(); + if (ii >= chunk_size) return; + int itype = type(ii); + int ielem = d_map[itype]; + + my_sna.pre_ui_cpu(team,ii,ielem); +} + + + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_cpu(team,ii,jj); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the quantum number + const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); + if (idx >= my_sna.idxu_max) return; + + // Extract the atomic index + const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); + if (ii >= chunk_size) return; + + if (chemflag) + for(int ielem = 0; ielem < nelements; ielem++) + my_sna.zero_yi_cpu(idx,ii,ielem); + else + my_sna.zero_yi_cpu(idx,ii,0); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { + SNAKokkos my_sna = snaKK; + my_sna.compute_yi_cpu(ii,d_beta); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU,const int& ii) const { + SNAKokkos my_sna = snaKK; + my_sna.compute_zi_cpu(ii); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + int ii = team.league_rank(); + SNAKokkos my_sna = snaKK; + my_sna.compute_bi_cpu(team,ii); +} + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename 
Kokkos::TeamPolicy::member_type& team) const { @@ -735,6 +980,12 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const t my_sna.compute_deidrj_cpu(team,ii,jj); } +/* ---------------------------------------------------------------------- + Also used for both CPU and GPU codepaths. Could maybe benefit from a + separate GPU/CPU codepath, but this kernel takes so little time it's + likely not worth it. +------------------------------------------------------------------------- */ + template template KOKKOS_INLINE_FUNCTION @@ -799,20 +1050,31 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeForce t_sna_2d; typedef Kokkos::View t_sna_2d_ll; typedef Kokkos::View t_sna_3d; + typedef Kokkos::View t_sna_3d_ll; typedef Kokkos::View t_sna_4d; + typedef Kokkos::View t_sna_4d_ll; typedef Kokkos::View t_sna_3d3; typedef Kokkos::View t_sna_5d; @@ -48,7 +50,8 @@ public: typedef Kokkos::View t_sna_3c; typedef Kokkos::View t_sna_3c_ll; typedef Kokkos::View t_sna_4c; - typedef Kokkos::View t_sna_4c_ll; + typedef Kokkos::View t_sna_4c3_ll; + typedef Kokkos::View t_sna_4c_ll; typedef Kokkos::View t_sna_3c3; typedef Kokkos::View t_sna_5c; @@ -73,27 +76,39 @@ inline int ncoeff; - // functions for bispectrum coefficients + // functions for bispectrum coefficients, GPU only KOKKOS_INLINE_FUNCTION - void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&, int); // ForceSNAP + void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); // ForceSNAP KOKKOS_INLINE_FUNCTION + void compute_zi(const int&, const int&, const int&); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_yi(int,int,int, + const Kokkos::View &beta_pack); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_bi(const int&, const int&, const int&); // ForceSNAP + + // functions for bispectrum coefficients, CPU only + KOKKOS_INLINE_FUNCTION + void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP + KOKKOS_INLINE_FUNCTION void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_zi(const int&); // ForceSNAP + void compute_zi_cpu(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void zero_yi(const int&, const int&, int); // ForceSNAP + void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_yi(int, + void compute_yi_cpu(int, const Kokkos::View &beta); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_bi(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP - - // functions for derivatives + KOKKOS_INLINE_FUNCTION + void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + // functions for derivatives, GPU only KOKKOS_INLINE_FUNCTION void compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); //ForceSNAP + + // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP KOKKOS_INLINE_FUNCTION @@ -139,18 +154,32 @@ inline int twojmax, diagonalstyle; - t_sna_2d_ll blist; - t_sna_2c_ll ulisttot; - t_sna_2c_ll zlist; + t_sna_3d_ll blist; + t_sna_3c_ll ulisttot; + t_sna_3c_ll zlist; t_sna_3c_ll ulist; - t_sna_2c_ll ylist; - + t_sna_3c_ll ylist; + // derivatives of data - t_sna_4c_ll dulist; 
+ t_sna_4c3_ll dulist; + + // Modified structures for GPU backend + t_sna_3d_ll ulisttot_re; // split real, + t_sna_3d_ll ulisttot_im; // imag + t_sna_4c_ll ulisttot_pack; // AoSoA layout + t_sna_4c_ll zlist_pack; // AoSoA layout + t_sna_4d_ll blist_pack; + t_sna_4d_ll ylist_pack_re; // split real, + t_sna_4d_ll ylist_pack_im; // imag AoSoA layout int idxcg_max, idxu_max, idxz_max, idxb_max; + // Chem snap counts + int nelements; + int ndoubles; + int ntriples; + private: double rmin0, rfac0; @@ -212,9 +241,6 @@ inline // Chem snap flags int chem_flag; int bnorm_flag; - int nelements; - int ndoubles; - int ntriples; // Self-weight double wself; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index dae4a0c2a1..8f265751ed 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -237,56 +237,71 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) element = t_sna_2i("sna:rcutij",natom,nmax); dedr = t_sna_3d("sna:dedr",natom,nmax,3); - blist = t_sna_2d_ll("sna:blist",idxb_max*ntriples,natom); - //ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max); - ulisttot = t_sna_2c_ll("sna:ulisttot",idxu_max*nelements,natom); - - zlist = t_sna_2c_ll("sna:zlist",idxz_max*ndoubles,natom); - - //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); #ifdef KOKKOS_ENABLE_CUDA if (std::is_same::value) { // dummy allocation + ulisttot = t_sna_3c_ll("sna:ulisttot",1,1,1); + ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",idxu_max,nelements,natom); + ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",idxu_max,nelements,natom); + ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",32,idxu_max,nelements,(natom+32-1)/32); ulist = t_sna_3c_ll("sna:ulist",1,1,1); - dulist = t_sna_4c_ll("sna:dulist",1,1,1); + zlist = t_sna_3c_ll("sna:zlist",1,1,1); + zlist_pack = t_sna_4c_ll("sna:zlist_pack",32,idxz_max,ndoubles,(natom+32-1)/32); + blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom); + blist_pack = t_sna_4d_ll("sna:blist_pack",32,idxb_max,ntriples,(natom+32-1)/32); + ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom); + ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",32,idxu_max,nelements,(natom+32-1)/32); + ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",32,idxu_max,nelements,(natom+32-1)/32); + dulist = t_sna_4c3_ll("sna:dulist",1,1,1); } else { #endif + ulisttot = t_sna_3c_ll("sna:ulisttot",idxu_max,nelements,natom); + ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",1,1,1); + ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",1,1,1); + ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",1,1,1,1); ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); - dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); + zlist = t_sna_3c_ll("sna:zlist",idxz_max,ndoubles,natom); + zlist_pack = t_sna_4c_ll("sna:zlist_pack",1,1,1,1); + blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom); + blist_pack = t_sna_4d_ll("sna:blist_pack",1,1,1,1); + ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom); + ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",1,1,1,1); + ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",1,1,1,1); + dulist = t_sna_4c3_ll("sna:dulist",idxu_max,natom,nmax); + #ifdef KOKKOS_ENABLE_CUDA } #endif - - //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); - ylist = t_sna_2c_ll("sna:ylist",idxu_max*nelements,natom); } /* ---------------------------------------------------------------------- - * compute Ui by summing over neighbors j - * ------------------------------------------------------------------------- */ + * GPU routines + * 
----------------------------------------------------------------------*/ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, int ielem) +void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) { - for(int jelem = 0; jelem < nelements; jelem++) - for (int j = 0; j <= twojmax; j++) { - const int jju = idxu_block(j); + for (int jelem = 0; jelem < nelements; jelem++) { + for (int j = 0; j <= twojmax; j++) { + const int jju = idxu_block(j); - // Only diagonal elements get initialized - // for (int m = 0; m < (j+1)*(j+1); m++) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), - [&] (const int m) { + // Only diagonal elements get initialized + // for (int m = 0; m < (j+1)*(j+1); m++) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), + [&] (const int m) { - const int jjup = jju + m; + const int jjup = jju + m; - // if m is on the "diagonal", initialize it with the self energy. - // Otherwise zero it out - SNAcomplex init = {0., 0.}; - if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = {wself, 0.0}; } //need to map iatom to element + // if m is on the "diagonal", initialize it with the self energy. + // Otherwise zero it out + double re_part = 0.; + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; } - ulisttot(jelem*idxu_max+jjup, iatom) = init; - }); + ulisttot_re(jjup, jelem, iatom) = re_part; + ulisttot_im(jjup, jelem, iatom) = 0.; + }); + } } } @@ -322,7 +337,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); - //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.); + const SNAcomplex u_up2 = rootpq2*((ma > 0)?buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.)); + + // u_accum += conj(b) * u_up2 caconjxpy(b, u_up2, u_accum); // VMK recursion relation: grab contribution which is multiplied by a* const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); - //const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.); + const SNAcomplex u_up1 = rootpq1*((ma < j)?buf1[jjup_shared_idx]:SNAcomplex(0.,0.)); + + // u_accum += conj(a) * u_up1 caconjxpy(a, u_up1, u_accum); - //ulist(jju_index,iatom,jnbor) = u_accum; // back up into shared memory for next iter buf2[jju_shared_idx] = u_accum; - Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re); - Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im); + Kokkos::atomic_add(&(ulisttot_re(jju_index,jpos,iatom)), sfac * u_accum.re); + Kokkos::atomic_add(&(ulisttot_im(jju_index,jpos,iatom)), sfac * u_accum.im); // copy left side to right side with inversion symmetry VMK 4.4(2) // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) @@ -415,7 +428,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, const int& 
iatom_div) { - double rsq, r, x, y, z, z0, theta0; - - // utot(j,ma,mb) = 0 for all j,ma,ma - // utot(j,ma,ma) = 1 for all j,ma - // for j in neighbors of i: - // compute r0 = (x,y,z,z0) - // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb - - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - - theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); - // theta0 = (r - rmin0) * rscale0; - z0 = r / tan(theta0); - - compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); - add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), element(iatom, jnbor)); - -} - -/* ---------------------------------------------------------------------- - compute Zi by summing over products of Ui -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi(const int& iter) -{ - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; const int j1 = idxz(jjz, 0); const int j2 = idxz(jjz, 1); @@ -493,77 +474,161 @@ void SNAKokkos::compute_zi(const int& iter) const int na = idxz(jjz, 7); const int nb = idxz(jjz, 8); - const double *cgblock = cglist.data() + idxcg_block(j1, j2, j); + const double* cgblock = cglist.data() + idxcg_block(j1, j2, j); int idouble = 0; - for(int elem1 = 0; elem1 < nelements; elem1++) - for(int elem2 = 0; elem2 < nelements; elem2++) { - const int jjza = idouble*idxz_max + jjz; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + double ztmp_r = 0.; + double ztmp_i = 0.; - zlist(jjza, iatom).re = 0.0; - zlist(jjza, iatom).im = 0.0; + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; - int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min; - int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 + 1) + mb2max; - for (int ib = 0; ib < nb; ib++) { + #pragma unroll + for(int ib = 0; ib < nb; ib++) { - double suma1_r = 0.0; - double suma1_i = 0.0; + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min * (j2 + 1) + ma2max; - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).re - - ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).im + - ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia + #pragma unroll + for(int ia = 0; ia < na; ia++) { + const SNAcomplex utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div); + const SNAcomplex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div); + const auto cgcoeff_a = cgblock[icga]; + const auto cgcoeff_b = cgblock[icgb]; + ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); + ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - zlist(jjza, iatom).re += cgblock[icgb] * suma1_r; - zlist(jjza, iatom).im += cgblock[icgb] * suma1_i; + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; + if (bnorm_flag) { + ztmp_r /= (j + 1); + ztmp_i /= (j + 1); + } - } // end loop over 
ib - if (bnorm_flag) { - zlist(jjza, iatom).re /= (j+1); - zlist(jjza, iatom).im /= (j+1); - } - idouble++; + zlist_pack(iatom_mod,jjz,idouble,iatom_div) = { ztmp_r, ztmp_i }; + + idouble++; + } } } /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices + compute Bi by summing conj(Ui)*Zi ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::zero_yi(const int& idx, const int& iatom, int ielem) +void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) { - ylist(ielem*idxu_max+idx,iatom) = {0.0, 0.0}; + // for j1 = 0,...,twojmax + // for j2 = 0,twojmax + // for j = |j1-j2|,Min(twojmax,j1+j2),2 + // b(j1,j2,j) = 0 + // for mb = 0,...,jmid + // for ma = 0,...,j + // b(j1,j2,j) += + // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) + + const int j1 = idxb(jjb,0); + const int j2 = idxb(jjb,1); + const int j = idxb(jjb,2); + + const int jjz = idxz_block(j1,j2,j); + const int jju = idxu_block[j]; + + int itriple = 0; + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + const int jalloy = idouble; + + for (int elem3 = 0; elem3 < nelements; elem3++) { + const int jjballoy = itriple; + + double sumzu = 0.0; + double sumzu_temp = 0.0; + + for(int mb = 0; 2*mb < j; mb++) { + for(int ma = 0; ma <= j; ma++) { + const int jju_index = jju+mb*(j+1)+ma; + const int jjz_index = jjz+mb*(j+1)+ma; + if (2*mb == j) return; // I think we can remove this? + const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + } + } + sumzu += sumzu_temp; + + // For j even, special treatment for middle column + if (j%2 == 0) { + sumzu_temp = 0.; + + const int mb = j/2; + for(int ma = 0; ma < mb; ma++) { + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + + } + sumzu += sumzu_temp; + + const int ma = mb; + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div); + sumzu += 0.5 * (utot.re * zloc.re + utot.im * zloc.im); + } // end if jeven + + sumzu *= 2.0; + if (bzero_flag) { + if (!wselfall_flag) { + if (elem1 == elem2 && elem1 == elem3) { + sumzu -= bzero[j]; + } + } else { + sumzu -= bzero[j]; + } + } + blist_pack(iatom_mod, jjb, jjballoy, iatom_div) = sumzu; + //} // end loop over j + //} // end loop over j1, j2 + itriple++; + } // end loop over elem3 + idouble++; + } // end loop over elem2 + } // end loop over elem1 } + /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices + compute Yi from Ui without storing Zi, looping over zlist indices, + GPU version ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi(int iter, - const Kokkos::View &beta) +void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, + const 
Kokkos::View &beta_pack) { double betaj; - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; const int j1 = idxz(jjz, 0); const int j2 = idxz(jjz, 1); @@ -576,88 +641,84 @@ void SNAKokkos::compute_yi(int iter, const int nb = idxz(jjz, 8); const int jju = idxz(jjz, 9); - const double *cgblock = cglist.data() + idxcg_block(j1, j2, j); + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; - int itriple; - for(int elem1 = 0; elem1 < nelements; elem1++) + for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - double ztmp_r = 0.0; - double ztmp_i = 0.0; + double ztmp_r = 0.0; + double ztmp_i = 0.0; - int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min; - int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 + 1) + mb2max; - for (int ib = 0; ib < nb; ib++) { + int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; + int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; + int icgb = mb1min * (j2 + 1) + mb2max; - double suma1_r = 0.0; - double suma1_i = 0.0; + #pragma unroll + for (int ib = 0; ib < nb; ib++) { - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min * (j2 + 1) + ma2max; + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * - ulisttot(jju2 + ma2, iatom).re - - ulisttot(jju1 + ma1, iatom).im * - ulisttot(jju2 + ma2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * - ulisttot(jju2 + ma2, iatom).im + - ulisttot(jju1 + ma1, iatom).im * - ulisttot(jju2 + ma2, iatom).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia + #pragma unroll + for (int ia = 0; ia < na; ia++) { + const SNAcomplex utot1 = ulisttot_pack(iatom_mod,jju1+ma1,elem1,iatom_div); + const SNAcomplex utot2 = ulisttot_pack(iatom_mod,jju2+ma2,elem2,iatom_div); + const auto cgcoeff_a = cgblock[icga]; + const auto cgcoeff_b = cgblock[icgb]; + ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); + ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - ztmp_r += cgblock[icgb] * suma1_r; - ztmp_i += cgblock[icgb] * suma1_i; - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib - if (bnorm_flag) { - ztmp_i /= j + 1; - ztmp_r /= j + 1; - } - - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { - const int jjuy = elem3 * idxu_max + jju; - if (j >= j1) { - const int jjb = idxb_block(j1, j2, j); - itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; - if (j1 == j) { - if (j2 == j) betaj = 3 * beta(itriple, iatom); - else betaj = 2 * beta(itriple, iatom); - } else betaj = beta(itriple, iatom); - } else if (j >= j2) { - const int jjb = idxb_block(j, j2, j1); - itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = 2 * beta(itriple, iatom); - else betaj = beta(itriple, iatom); - } else { - const int jjb = idxb_block(j2, j, j1); - itriple = ((elem2 * nelements + elem3) * 
nelements + elem1) * idxb_max + jjb; - betaj = beta(itriple, iatom); + if (bnorm_flag) { + ztmp_r /= j + 1; + ztmp_i /= j + 1; } - if (!bnorm_flag && j1 > j) - betaj *= (j1 + 1) / (j + 1.0); + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 - Kokkos::atomic_add(&(ylist(jjuy, iatom).re), betaj * ztmp_r); - Kokkos::atomic_add(&(ylist(jjuy, iatom).im), betaj * ztmp_i); - } - } + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + if (j >= j1) { + const int jjb = idxb_block(j1, j2, j); + const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; + if (j1 == j) { + if (j2 == j) betaj = 3 * beta_pack(iatom_mod, itriple, iatom_div); + else betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div); + } else betaj = beta_pack(iatom_mod, itriple, iatom_div); + } else if (j >= j2) { + const int jjb = idxb_block(j, j2, j1); + const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; + if (j2 == j) betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div); + else betaj = beta_pack(iatom_mod, itriple, iatom_div); + } else { + const int jjb = idxb_block(j2, j, j1); + const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; + betaj = beta_pack(iatom_mod, itriple, iatom_div); + } + + if (!bnorm_flag && j1 > j) + betaj *= (j1 + 1) / (j + 1.0); + + + Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_i); + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 } /* ---------------------------------------------------------------------- @@ -673,7 +734,7 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli const int max_m_tile = (twojmax+1)*(twojmax/2+1); const int team_rank = team.team_rank(); const int scratch_shift = team_rank * max_m_tile; - const int jpos = element(iatom, jnbor)*idxu_max; + const int jelem = element(iatom, jnbor); // double buffer for ulist SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; @@ -723,14 +784,14 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // Accumulate the full contribution to dedr on the fly const double du_prod = dsfac * u; // chain rule - const SNAcomplex y_local = ylist(jpos, iatom); + const SNAcomplex y_local = ylist(0, jelem, iatom); // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0 double dedr_full_sum = 0.5 * du_prod * y_local.re; // single has a warp barrier at the end Kokkos::single(Kokkos::PerThread(team), [=]() { - //dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here + ulist_buf1[0] = {1., 0.}; dulist_buf1[0] = {0., 0.}; }); @@ -755,10 +816,11 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli [&] (const int m, double& sum_tmp) { // ma fast, mb slow - int ma = m % n_ma; - int mb = m / n_ma; + // Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive. 
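          // (Illustrative aside, not from this patch: for m >= 0 and n_ma > 0,
          //    const int mb = m / n_ma;
          //    const int ma = m - mb * n_ma;   // same value as m % n_ma
          //  reuses the quotient instead of requesting a separate modulo, which
          //  may otherwise compile to a second integer-division sequence on the GPU.)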
+ const int mb = m / n_ma; + const int ma = m - mb * n_ma; - const int jju_index = jpos+jju+m; + const int jju_index = jju+m; // Load y_local, apply the symmetry scaling factor // The "secret" of the shared memory optimization is it eliminates @@ -766,7 +828,7 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // shared memory and otherwise always writing, making the kernel // ultimately compute bound. We take advantage of that by adding // some reads back in. - auto y_local = ylist(jju_index,iatom); + auto y_local = ylist(jju_index, jelem, iatom); if (j % 2 == 0 && 2*mb == j) { if (ma == mb) { y_local = 0.5*y_local; } else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright @@ -783,29 +845,35 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli SNAcomplex du_accum = { 0., 0. }; const double rootpq2 = -rootpqarray(ma, j - mb); - const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); + const SNAcomplex u_up2 = rootpq2*((ma > 0) ? ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.)); + + // u_accum += conj(b) * u_up2 caconjxpy(b, u_up2, u_accum); const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.); + const SNAcomplex u_up1 = rootpq1*((ma < j) ? ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.)); + + // u_accum += conj(a) * u_up1 caconjxpy(a, u_up1, u_accum); // Next, spin up du_accum - const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.); + const SNAcomplex du_up1 = rootpq1*((ma < j) ? dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.)); + + // du_accum += conj(da) * u_up1 + conj(a) * du_up1 caconjxpy(da, u_up1, du_accum); caconjxpy(a, du_up1, du_accum); - const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.); + const SNAcomplex du_up2 = rootpq2*((ma > 0) ? dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.)); + + // du_accum += conj(db) * u_up2 + conj(b) * du_up2 caconjxpy(db, u_up2, du_accum); caconjxpy(b, du_up2, du_accum); - // No need to save u_accum to global memory // Cache u_accum, du_accum to scratch memory. ulist_buf2[jju_shared_idx] = u_accum; dulist_buf2[jju_shared_idx] = du_accum; // Directly accumulate deidrj into sum_tmp - //dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum); sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im; @@ -813,7 +881,7 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) if (j%2==1 && mb+1==n_mb) { int sign_factor = (((ma+mb)%2==0)?1:-1); - //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist + const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); if (sign_factor == 1) { @@ -826,8 +894,6 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // We don't need the second half of the tile for the deidrj accumulation. // That's taken care of by the symmetry factor above. 
- //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); - // We do need it for ortho polynomial generation, though ulist_buf2[jju_shared_flip] = u_accum; dulist_buf2[jju_shared_flip] = du_accum; @@ -854,69 +920,150 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli }); } -/* ---------------------------------------------------------------------- - compute dEidRj, CPU path only. -------------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + * CPU routines + * ----------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------- + * compute Ui by summing over neighbors j + * ------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) { - t_scalar3 final_sum; - const int jelem = element(iatom, jnbor); + for (int jelem = 0; jelem < nelements; jelem++) { + for (int j = 0; j <= twojmax; j++) { + const int jju = idxu_block(j); - //for(int j = 0; j <= twojmax; j++) { - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j, t_scalar3& sum_tmp) { - int jju = idxu_block[j]; - int jjuy = idxu_block[j] + jelem*idxu_max; + // Only diagonal elements get initialized + // for (int m = 0; m < (j+1)*(j+1); m++) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), + [&] (const int m) { - for(int mb = 0; 2*mb < j; mb++) - for(int ma = 0; ma <= j; ma++) { - sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im; - sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im; - sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im; - jju++; - jjuy++; - } //end loop over ma mb + const int jjup = jju + m; - // For j even, handle middle column + // if m is on the "diagonal", initialize it with the self energy. 
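        // (Illustrative aside, not from this patch: with the (j+1)x(j+1) tile stored
        //  as m = mb*(j+1) + ma, the diagonal entries ma == mb occur exactly at
        //  m = mb*(j+2), which is what the m % (j+2) == 0 test below detects.)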
+ // Otherwise zero it out + SNAcomplex init = {0., 0.}; + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = {wself, 0.0}; } //need to map iatom to element - if (j%2 == 0) { - - int mb = j/2; - for(int ma = 0; ma < mb; ma++) { - sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im; - sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im; - sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im; - jju++; - jjuy++; - } - - //int ma = mb; - sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im)*0.5; - sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im)*0.5; - sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im)*0.5; - } // end if jeven - - },final_sum); // end loop over j - - Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,0) = final_sum.x*2.0; - dedr(iatom,jnbor,1) = final_sum.y*2.0; - dedr(iatom,jnbor,2) = final_sum.z*2.0; - }); + ulisttot(jjup, jelem, iatom) = init; + }); + } + } } + /* ---------------------------------------------------------------------- - compute Bi by summing conj(Ui)*Zi + compute Ui by summing over bispectrum components. CPU only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +{ + double rsq, r, x, y, z, z0, theta0; + + // utot(j,ma,mb) = 0 for all j,ma,ma + // utot(j,ma,ma) = 1 for all j,ma + // for j in neighbors of i: + // compute r0 = (x,y,z,z0) + // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb + + x = rij(iatom,jnbor,0); + y = rij(iatom,jnbor,1); + z = rij(iatom,jnbor,2); + rsq = x * x + y * y + z * z; + r = sqrt(rsq); + + theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + // theta0 = (r - rmin0) * rscale0; + z0 = r / tan(theta0); + + compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); + add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), element(iatom, jnbor)); + +} +/* ---------------------------------------------------------------------- + compute Zi by summing over products of Ui, CPU version +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_zi_cpu(const int& iter) +{ + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + + int idouble = 0; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + zlist(jjz, idouble, iatom).re = 0.0; + zlist(jjz, idouble, iatom).im = 0.0; + + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + 
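      // Editorial note (schematic, not from the original patch): the ib/ia
      // double loop below carries out the Clebsch-Gordan contraction
      //
      //   z_{j1,j2,j}(ma,mb) = sum over (mb1,mb2) pairs of C_b(mb1,mb2) *
      //                          sum over (ma1,ma2) pairs of C_a(ma1,ma2) *
      //                            u_{j1}(ma1,mb1) * u_{j2}(ma2,mb2)
      //
      // where cgblock[icga] supplies the ma-side coefficient C_a,
      // cgblock[icgb] the mb-side coefficient C_b, and jju1/jju2 step through
      // the rows of the two ulisttot blocks selected by (elem1, elem2).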
for(int ib = 0; ib < nb; ib++) { + + double suma1_r = 0.0; + double suma1_i = 0.0; + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min * (j2 + 1) + ma2max; + for(int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re - + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im); + suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im + + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r; + zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i; + + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + zlist(jjz, idouble, iatom).re /= (j+1); + zlist(jjz, idouble, iatom).im /= (j+1); + } + idouble++; + } // end loop over elem2 + } // end loop over elem1 +} + + +/* ---------------------------------------------------------------------- + compute Bi by summing conj(Ui)*Zi, CPU version +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) { // for j1 = 0,...,twojmax // for j2 = 0,twojmax @@ -929,15 +1076,14 @@ void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy::compute_bi(const typename Kokkos::TeamPolicy +KOKKOS_INLINE_FUNCTION +void SNAKokkos::zero_yi_cpu(const int& idx, const int& iatom, const int& ielem) +{ + ylist(idx,ielem,iatom) = {0.0, 0.0}; +} + +/* ---------------------------------------------------------------------- + compute Yi from Ui without storing Zi, looping over zlist indices, + CPU version +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_yi_cpu(int iter, + const Kokkos::View &beta) +{ + double betaj; + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + const int jju = idxz(jjz, 9); + + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; + //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + + double ztmp_r = 0.0; + double ztmp_i = 0.0; + + int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; + int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; + int icgb = mb1min * (j2 +1 ) + mb2max; + + for (int ib = 0; ib < nb; ib++) { + + double suma1_r = 0.0; + double suma1_i = 0.0; + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + + for (int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re - + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im); + suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im + + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + ztmp_r += 
cgblock[icgb] * suma1_r; + ztmp_i += cgblock[icgb] * suma1_i; + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + ztmp_i /= j + 1; + ztmp_r /= j + 1; + } + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + + if (j >= j1) { + const int jjb = idxb_block(j1, j2, j); + const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; + if (j1 == j) { + if (j2 == j) betaj = 3 * beta(itriple, iatom); + else betaj = 2 * beta(itriple, iatom); + } else betaj = beta(itriple, iatom); + } else if (j >= j2) { + const int jjb = idxb_block(j, j2, j1); + const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; + if (j2 == j) betaj = 2 * beta(itriple, iatom); + else betaj = beta(itriple, iatom); + } else { + const int jjb = idxb_block(j2, j, j1); + const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; + betaj = beta(itriple, iatom); + } + + if (!bnorm_flag && j1 > j) + betaj *= (j1 + 1) / (j + 1.0); + + Kokkos::atomic_add(&(ylist(jju, elem3, iatom).re), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist(jju, elem3, iatom).im), betaj*ztmp_i); + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 +} + + /* ---------------------------------------------------------------------- calculate derivative of Ui w.r.t. atom j ------------------------------------------------------------------------- */ @@ -1042,6 +1303,61 @@ void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor)); } + +/* ---------------------------------------------------------------------- + compute dEidRj, CPU path only. 
+------------------------------------------------------------------------- */ + + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +{ + t_scalar3 final_sum; + const int jelem = element(iatom, jnbor); + + //for(int j = 0; j <= twojmax; j++) { + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), + [&] (const int& j, t_scalar3& sum_tmp) { + int jju = idxu_block[j]; + + for(int mb = 0; 2*mb < j; mb++) + for(int ma = 0; ma <= j; ma++) { + sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im; + sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im; + sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im; + jju++; + } //end loop over ma mb + + // For j even, handle middle column + + if (j%2 == 0) { + + int mb = j/2; + for(int ma = 0; ma < mb; ma++) { + sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im; + sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im; + sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im; + jju++; + } + + //int ma = mb; + sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im)*0.5; + sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im)*0.5; + sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im)*0.5; + } // end if jeven + + },final_sum); // end loop over j + + Kokkos::single(Kokkos::PerThread(team), [&] () { + dedr(iatom,jnbor,0) = final_sum.x*2.0; + dedr(iatom,jnbor,1) = final_sum.y*2.0; + dedr(iatom,jnbor,2) = final_sum.z*2.0; + }); + +} + + /* ---------------------------------------------------------------------- add Wigner U-functions for one neighbor to the total ------------------------------------------------------------------------- */ @@ -1052,10 +1368,11 @@ void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::compute_duarray_cpu(const typename Kokkos::TeamPolic double z0, double r, double dz0dr, double wj, double rcut) { - double r0inv; +double r0inv; double a_r, a_i, b_r, b_i; double da_r[3], da_i[3], db_r[3], db_i[3]; double dz0[3], dr0inv[3], dr0invdr; @@ -1710,6 +2027,7 @@ double SNAKokkos::memory_usage() { int jdimpq = twojmax + 2; int jdim = twojmax + 1; + int natom_pad = ((natom + 32 - 1) / 32) * 32; // for AoSoA layouts double bytes; bytes = 0; @@ -1717,21 +2035,38 @@ double SNAKokkos::memory_usage() bytes += jdimpq*jdimpq * sizeof(double); // pqarray bytes += idxcg_max * sizeof(double); // cglist + + #ifdef KOKKOS_ENABLE_CUDA - if (!std::is_same::value) { + if (std::is_same::value) { + + bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_re + bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_im + bytes += natom_pad * idxu_max * nelements * sizeof(double) * 2; // ulisttot_pack + + bytes += natom_pad * idxz_max * ndoubles * sizeof(double) * 2; // zlist_pack + bytes += natom_pad * idxb_max * ntriples * sizeof(double); // 
blist_pack + + bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_re + bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_im + bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist + } else { #endif - bytes += natom * idxu_max * sizeof(double) * 2; // ulist - bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist + + bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist + bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot + + bytes += natom * idxz_max * ndoubles * sizeof(double) * 2; // zlist + bytes += natom * idxb_max * ntriples * sizeof(double); // blist + + bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist + + bytes += natom * nmax * idxu_max * 3 * sizeof(double) * 2; // dulist #ifdef KOKKOS_ENABLE_CUDA } #endif - bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot - if (!std::is_same::value) - bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot_lr - bytes += natom * idxz_max * ndoubles * sizeof(double) * 2; // zlist - bytes += natom * idxb_max * ntriples * sizeof(double); // blist - bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist + bytes += natom * nmax * 3 * sizeof(double); // dedr bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block bytes += jdim * sizeof(int); // idxu_block @@ -1747,7 +2082,6 @@ double SNAKokkos::memory_usage() bytes += natom * nmax * sizeof(int); // inside bytes += natom * nmax * sizeof(double); // wj bytes += natom * nmax * sizeof(double); // rcutij - bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist_ij return bytes; } From 55d6f1a34f55b3d8365130b31fa1358c6c20baa5 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 19 Jun 2020 19:33:17 -0400 Subject: [PATCH 2/6] Fixed variable name to be more consistent with rest of code. 
--- src/KOKKOS/sna_kokkos_impl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 8f265751ed..866b4c516f 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -337,7 +337,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy Date: Mon, 22 Jun 2020 10:50:31 -0600 Subject: [PATCH 3/6] Whitespace tweak --- src/KOKKOS/pair_snap_kokkos_impl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 64831e91cf..6bedf5554e 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -261,10 +261,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); } - //ZeroYi,ComputeYi { - int vector_length = vector_length_default; int team_size = team_size_default; From 198258766d768cce97e560014630c68a3eb3620f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 22 Jun 2020 18:59:09 -0400 Subject: [PATCH 4/6] ensure that per-arrays are only freed if the last atom style instance is deleted --- src/atom_vec.cpp | 76 ++++++++++++++++++++++++++---------------------- src/atom_vec.h | 4 +++ 2 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp index 6a294d3ccf..5a86caa4fb 100644 --- a/src/atom_vec.cpp +++ b/src/atom_vec.cpp @@ -33,6 +33,8 @@ using namespace MathConst; #define DELTA 16384 #define DELTA_BONUS 8192 +int AtomVec::num_atom_vecs = 0; + /* ---------------------------------------------------------------------- */ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) @@ -54,6 +56,8 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) threads = NULL; + ++num_atom_vecs; + // peratom variables auto-included in corresponding child style fields string // these fields cannot be specified in the fields string @@ -93,44 +97,48 @@ AtomVec::~AtomVec() int datatype,cols; void *pdata; + --num_atom_vecs; + for (int i = 0; i < nargcopy; i++) delete [] argcopy[i]; delete [] argcopy; - memory->destroy(atom->tag); - memory->destroy(atom->type); - memory->destroy(atom->mask); - memory->destroy(atom->image); - memory->destroy(atom->x); - memory->destroy(atom->v); - memory->destroy(atom->f); + if (num_atom_vecs == 0) { + memory->destroy(atom->tag); + memory->destroy(atom->type); + memory->destroy(atom->mask); + memory->destroy(atom->image); + memory->destroy(atom->x); + memory->destroy(atom->v); + memory->destroy(atom->f); - for (int i = 0; i < ngrow; i++) { - pdata = mgrow.pdata[i]; - datatype = mgrow.datatype[i]; - cols = mgrow.cols[i]; - if (datatype == Atom::DOUBLE) { - if (cols == 0) - memory->destroy(*((double **) pdata)); - else if (cols > 0) - memory->destroy(*((double ***) pdata)); - else { - memory->destroy(*((double ***) pdata)); - } - } else if (datatype == Atom::INT) { - if (cols == 0) - memory->destroy(*((int **) pdata)); - else if (cols > 0) - memory->destroy(*((int ***) pdata)); - else { - memory->destroy(*((int ***) pdata)); - } - } else if (datatype == Atom::BIGINT) { - if (cols == 0) - memory->destroy(*((bigint **) pdata)); - else if (cols > 0) - memory->destroy(*((bigint ***) pdata)); - else { - memory->destroy(*((bigint ***) pdata)); + for (int i = 0; i < ngrow; i++) { + pdata = mgrow.pdata[i]; + datatype = mgrow.datatype[i]; + cols = 
mgrow.cols[i]; + if (datatype == Atom::DOUBLE) { + if (cols == 0) + memory->destroy(*((double **) pdata)); + else if (cols > 0) + memory->destroy(*((double ***) pdata)); + else { + memory->destroy(*((double ***) pdata)); + } + } else if (datatype == Atom::INT) { + if (cols == 0) + memory->destroy(*((int **) pdata)); + else if (cols > 0) + memory->destroy(*((int ***) pdata)); + else { + memory->destroy(*((int ***) pdata)); + } + } else if (datatype == Atom::BIGINT) { + if (cols == 0) + memory->destroy(*((bigint **) pdata)); + else if (cols > 0) + memory->destroy(*((bigint ***) pdata)); + else { + memory->destroy(*((bigint ***) pdata)); + } } } } diff --git a/src/atom_vec.h b/src/atom_vec.h index 8ccf922c4b..625307face 100644 --- a/src/atom_vec.h +++ b/src/atom_vec.h @@ -209,6 +209,10 @@ class AtomVec : protected Pointers { bool *threads; + // counter for atom vec instances + + static int num_atom_vecs; + // local methods void grow_nmax(); From afe6484c445db453135f296541fa50187940aa0b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 22 Jun 2020 19:15:28 -0400 Subject: [PATCH 5/6] Revert "reorganize memory (de-)allocation and fix substantial memory leak in MSM" This reverts commit f78671c1a4d81e5907e929c9341d784e03881bd6. --- src/KSPACE/msm.cpp | 195 +++++++++++++++++++++++---------------------- 1 file changed, 100 insertions(+), 95 deletions(-) diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index abc8ca1150..70a36984d7 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -28,8 +28,6 @@ #include "domain.h" #include "memory.h" #include "error.h" -#include "utils.h" -#include "fmt/format.h" #include "math_const.h" @@ -65,6 +63,9 @@ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), factors[0] = 2; MPI_Comm_rank(world,&me); + procneigh_levels = NULL; + world_levels = NULL; + active_flag = NULL; phi1d = dphi1d = NULL; @@ -80,7 +81,15 @@ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), v0_direct_top = v1_direct_top = v2_direct_top = NULL; v3_direct_top = v4_direct_top = v5_direct_top = NULL; + cg_all = cg_peratom_all = NULL; + cg = cg_peratom = NULL; + ngrid = NULL; + cg = NULL; + cg_peratom = NULL; + procneigh_levels = NULL; + world_levels = NULL; + active_flag = NULL; alpha = betax = betay = betaz = NULL; nx_msm = ny_msm = nz_msm = NULL; @@ -141,7 +150,10 @@ MSM::~MSM() void MSM::init() { - if (me == 0) utils::logmesg(lmp,"MSM initialization ...\n"); + if (me == 0) { + if (screen) fprintf(screen,"MSM initialization ...\n"); + if (logfile) fprintf(logfile,"MSM initialization ...\n"); + } // error check @@ -157,8 +169,13 @@ void MSM::init() if ((slabflag == 1) && (me == 0)) error->warning(FLERR,"Slab correction not needed for MSM"); - if ((order < 4) || (order > 10) || (order%2 != 0)) - error->all(FLERR,"MSM order must be 4, 6, 8, or 10"); + if (order < 4 || order > 10) { + char str[128]; + sprintf(str,"MSM order must be 4, 6, 8, or 10"); + error->all(FLERR,str); + } + + if (order%2 != 0) error->all(FLERR,"MSM order must be 4, 6, 8, or 10"); if (sizeof(FFT_SCALAR) != 8) error->all(FLERR,"Cannot (yet) use single precision with MSM " @@ -204,14 +221,33 @@ void MSM::init() MPI_Allreduce(&ngrid[0],&ngrid_max,1,MPI_INT,MPI_MAX,world); if (me == 0) { - std::string mesg = fmt::format(" 3d grid size/proc = {}\n", ngrid_max); - mesg += fmt::format(" estimated absolute RMS force accuracy = {}\n", - estimated_error); - mesg += fmt::format(" estimated relative force accuracy = {}\n", - estimated_error/two_charge_force); - mesg += fmt::format(" grid = {} {} {}\n",nx_msm[0],ny_msm[0],nz_msm[0]); - mesg += 
fmt::format(" order = {}\n",order); - utils::logmesg(lmp,mesg); + if (screen) { + fprintf(screen," 3d grid size/proc = %d\n", + ngrid_max); + fprintf(screen," estimated absolute RMS force accuracy = %g\n", + estimated_error); + fprintf(screen," estimated relative force accuracy = %g\n", + estimated_error/two_charge_force); + } + if (logfile) { + fprintf(logfile," 3d grid size/proc = %d\n", + ngrid_max); + fprintf(logfile," estimated absolute RMS force accuracy = %g\n", + estimated_error); + fprintf(logfile," estimated relative force accuracy = %g\n", + estimated_error/two_charge_force); + } + } + + if (me == 0) { + if (screen) { + fprintf(screen," grid = %d %d %d\n",nx_msm[0],ny_msm[0],nz_msm[0]); + fprintf(screen," order = %d\n",order); + } + if (logfile) { + fprintf(logfile," grid = %d %d %d\n",nx_msm[0],ny_msm[0],nz_msm[0]); + fprintf(logfile," order = %d\n",order); + } } } @@ -362,6 +398,7 @@ void MSM::setup() nmax_direct = 8*(nxhi_direct+1)*(nyhi_direct+1)*(nzhi_direct+1); + deallocate(); if (peratom_allocate_flag) deallocate_peratom(); // compute direct sum interaction weights @@ -612,8 +649,6 @@ void MSM::compute(int eflag, int vflag) void MSM::allocate() { - deallocate(); - // interpolation coeffs order_allocated = order; @@ -635,9 +670,9 @@ void MSM::allocate() // allocate memory for each grid level for (int n=0; ncreate3d_offset(qgrid[n],nzlo_out[n],nzhi_out[n], nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:qgrid"); + memory->create3d_offset(egrid[n],nzlo_out[n],nzhi_out[n], nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:egrid"); @@ -660,23 +695,24 @@ void MSM::allocate() void MSM::deallocate() { - delete cg_all; - cg_all = nullptr; - memory->destroy2d_offset(phi1d,-order_allocated); memory->destroy2d_offset(dphi1d,-order_allocated); + if (cg_all) delete cg_all; + for (int n=0; ndestroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); + if (qgrid[n]) + memory->destroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - if (world_levels[n] != MPI_COMM_NULL) - MPI_Comm_free(&world_levels[n]); - world_levels[n] = MPI_COMM_NULL; - active_flag[n] = 0; + if (egrid[n]) + memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - delete cg[n]; - cg[n] = nullptr; + if (world_levels) + if (world_levels[n] != MPI_COMM_NULL) + MPI_Comm_free(&world_levels[n]); + + if (cg) + if (cg[n]) delete cg[n]; } } @@ -765,7 +801,6 @@ void MSM::deallocate_peratom() void MSM::allocate_levels() { - deallocate_levels(); ngrid = new int[levels]; cg = new GridComm*[levels]; @@ -815,21 +850,21 @@ void MSM::allocate_levels() v5grid = new double***[levels]; for (int n=0; ndestroy(procneigh_levels); delete [] world_levels; @@ -886,50 +919,6 @@ void MSM::deallocate_levels() delete [] v3grid; delete [] v4grid; delete [] v5grid; - - world_levels = nullptr; - active_flag = nullptr; - cg = nullptr; - cg_peratom = nullptr; - - alpha = nullptr; - betax = nullptr; - betay = nullptr; - betaz = nullptr; - - nx_msm = nullptr; - ny_msm = nullptr; - nz_msm = nullptr; - - nxlo_in = nullptr; - nylo_in = nullptr; - nzlo_in = nullptr; - - nxhi_in = nullptr; - nyhi_in = nullptr; - nzhi_in = nullptr; - - nxlo_out = nullptr; - nylo_out = nullptr; - nzlo_out = nullptr; - - nxhi_out = nullptr; - nyhi_out = nullptr; - nzhi_out = nullptr; - - delxinv = nullptr; - delyinv = nullptr; - delzinv = nullptr; - - qgrid = nullptr; - egrid = nullptr; - - v0grid = nullptr; - v1grid = nullptr; - v2grid = nullptr; - v3grid = nullptr; - 
v4grid = nullptr; - v5grid = nullptr; } /* ---------------------------------------------------------------------- @@ -1064,9 +1053,9 @@ void MSM::set_grid_global() double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); *p_cutoff = cutoff; - if (me == 0) - error->warning(FLERR,fmt::format("Adjusting Coulombic cutoff for " - "MSM, new cutoff = {}", cutoff)); + char str[128]; + sprintf(str,"Adjusting Coulombic cutoff for MSM, new cutoff = %g",cutoff); + if (me == 0) error->warning(FLERR,str); } if (triclinic == 0) { @@ -1105,6 +1094,7 @@ void MSM::set_grid_global() if (!domain->nonperiodic) levels -= 1; + deallocate_levels(); allocate_levels(); // find number of grid levels in each direction @@ -1328,8 +1318,8 @@ void MSM::set_grid_local() void MSM::set_proc_grid(int n) { - for (int i=0; i<3; i++) - myloc[i] = comm->myloc[i]; + for (int i=0; i<3; i++) + myloc[i] = comm->myloc[i]; // size of inner MSM grid owned by this proc @@ -1372,7 +1362,6 @@ void MSM::set_proc_grid(int n) // define a new MPI communicator for this grid level that only includes active procs - if(world_levels[n] != MPI_COMM_NULL) MPI_Comm_free(&world_levels[n]); MPI_Comm_split(world,color,me,&world_levels[n]); if (!active_flag[n]) return; @@ -1490,13 +1479,16 @@ void MSM::particle_map() void MSM::make_rho() { + //fprintf(screen,"MSM aninterpolation\n\n"); + int i,l,m,n,nx,ny,nz,mx,my,mz; double dx,dy,dz,x0,y0,z0; // clear 3d density array - double ***qgrid0 = qgrid[0]; - memset(&(qgrid0[nzlo_out[0]][nylo_out[0]][nxlo_out[0]]),0,ngrid[0]*sizeof(double)); + double ***qgridn = qgrid[0]; + + memset(&(qgridn[nzlo_out[0]][nylo_out[0]][nxlo_out[0]]),0,ngrid[0]*sizeof(double)); // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -1527,7 +1519,7 @@ void MSM::make_rho() x0 = y0*phi1d[1][m]; for (l = nlower; l <= nupper; l++) { mx = l+nx; - qgrid0[mz][my][mx] += x0*phi1d[0][l]; + qgridn[mz][my][mx] += x0*phi1d[0][l]; } } } @@ -1542,6 +1534,8 @@ void MSM::make_rho() void MSM::direct(int n) { + //fprintf(screen,"Direct contribution on level %i\n\n",n); + double ***qgridn = qgrid[n]; double ***egridn = egrid[n]; double ***v0gridn = v0grid[n]; @@ -1774,6 +1768,8 @@ void MSM::direct(int n) void MSM::direct_peratom(int n) { + //fprintf(screen,"Direct contribution on level %i\n\n",n); + double ***qgridn = qgrid[n]; double ***v0gridn = v0grid[n]; double ***v1gridn = v1grid[n]; @@ -1896,6 +1892,8 @@ void MSM::direct_peratom(int n) void MSM::direct_top(int n) { + //fprintf(screen,"Direct contribution on level %i\n\n",n); + double ***qgridn = qgrid[n]; double ***egridn = egrid[n]; double ***v0gridn = v0grid[n]; @@ -2259,6 +2257,8 @@ void MSM::direct_peratom_top(int n) void MSM::restriction(int n) { + //fprintf(screen,"Restricting from level %i to %i\n\n",n,n+1); + const int p = order-1; double ***qgrid1 = qgrid[n]; @@ -2281,7 +2281,8 @@ void MSM::restriction(int n) // zero out charge on coarser grid - memset(&(qgrid2[nzlo_out[n+1]][nylo_out[n+1]][nxlo_out[n+1]]),0,ngrid[n+1]*sizeof(double)); + memset(&(qgrid2[nzlo_out[n+1]][nylo_out[n+1]][nxlo_out[n+1]]),0, + ngrid[n+1]*sizeof(double)); for (kp = nzlo_in[n+1]; kp <= nzhi_in[n+1]; kp++) for (jp = nylo_in[n+1]; jp <= nyhi_in[n+1]; jp++) @@ -2330,6 +2331,8 @@ void MSM::restriction(int n) void MSM::prolongation(int n) { + //fprintf(screen,"Prolongating from level %i to %i\n\n",n+1,n); + const int p = order-1; double ***egrid1 = egrid[n]; @@ -2718,6 +2721,8 @@ void MSM::unpack_reverse(int flag, double 
*buf, int nlist, int *list) void MSM::fieldforce() { + //fprintf(screen,"MSM interpolation\n\n"); + double ***egridn = egrid[0]; int i,l,m,n,nx,ny,nz,mx,my,mz; From 8d5a9ad4af7bf708a45ab20900e2373384ed2fa0 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 22 Jun 2020 19:28:24 -0400 Subject: [PATCH 6/6] implement alternate version of MSM leak fix --- src/KSPACE/msm.cpp | 152 ++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index 70a36984d7..e4e5846c9e 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -28,6 +28,8 @@ #include "domain.h" #include "memory.h" #include "error.h" +#include "utils.h" +#include "fmt/format.h" #include "math_const.h" @@ -63,9 +65,6 @@ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), factors[0] = 2; MPI_Comm_rank(world,&me); - procneigh_levels = NULL; - world_levels = NULL; - active_flag = NULL; phi1d = dphi1d = NULL; @@ -81,15 +80,7 @@ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), v0_direct_top = v1_direct_top = v2_direct_top = NULL; v3_direct_top = v4_direct_top = v5_direct_top = NULL; - cg_all = cg_peratom_all = NULL; - cg = cg_peratom = NULL; - ngrid = NULL; - cg = NULL; - cg_peratom = NULL; - procneigh_levels = NULL; - world_levels = NULL; - active_flag = NULL; alpha = betax = betay = betaz = NULL; nx_msm = ny_msm = nz_msm = NULL; @@ -150,10 +141,7 @@ MSM::~MSM() void MSM::init() { - if (me == 0) { - if (screen) fprintf(screen,"MSM initialization ...\n"); - if (logfile) fprintf(logfile,"MSM initialization ...\n"); - } + if (me == 0) utils::logmesg(lmp,"MSM initialization ...\n"); // error check @@ -169,13 +157,8 @@ void MSM::init() if ((slabflag == 1) && (me == 0)) error->warning(FLERR,"Slab correction not needed for MSM"); - if (order < 4 || order > 10) { - char str[128]; - sprintf(str,"MSM order must be 4, 6, 8, or 10"); - error->all(FLERR,str); - } - - if (order%2 != 0) error->all(FLERR,"MSM order must be 4, 6, 8, or 10"); + if ((order < 4) || (order > 10) || (order%2 != 0)) + error->all(FLERR,"MSM order must be 4, 6, 8, or 10"); if (sizeof(FFT_SCALAR) != 8) error->all(FLERR,"Cannot (yet) use single precision with MSM " @@ -221,33 +204,14 @@ void MSM::init() MPI_Allreduce(&ngrid[0],&ngrid_max,1,MPI_INT,MPI_MAX,world); if (me == 0) { - if (screen) { - fprintf(screen," 3d grid size/proc = %d\n", - ngrid_max); - fprintf(screen," estimated absolute RMS force accuracy = %g\n", - estimated_error); - fprintf(screen," estimated relative force accuracy = %g\n", - estimated_error/two_charge_force); - } - if (logfile) { - fprintf(logfile," 3d grid size/proc = %d\n", - ngrid_max); - fprintf(logfile," estimated absolute RMS force accuracy = %g\n", - estimated_error); - fprintf(logfile," estimated relative force accuracy = %g\n", - estimated_error/two_charge_force); - } - } - - if (me == 0) { - if (screen) { - fprintf(screen," grid = %d %d %d\n",nx_msm[0],ny_msm[0],nz_msm[0]); - fprintf(screen," order = %d\n",order); - } - if (logfile) { - fprintf(logfile," grid = %d %d %d\n",nx_msm[0],ny_msm[0],nz_msm[0]); - fprintf(logfile," order = %d\n",order); - } + std::string mesg = fmt::format(" 3d grid size/proc = {}\n", ngrid_max); + mesg += fmt::format(" estimated absolute RMS force accuracy = {}\n", + estimated_error); + mesg += fmt::format(" estimated relative force accuracy = {}\n", + estimated_error/two_charge_force); + mesg += fmt::format(" grid = {} {} {}\n",nx_msm[0],ny_msm[0],nz_msm[0]); + mesg += fmt::format(" order = {}\n",order); + utils::logmesg(lmp,mesg); } } @@ -699,6 
+663,7 @@ void MSM::deallocate() memory->destroy2d_offset(dphi1d,-order_allocated); if (cg_all) delete cg_all; + cg_all = nullptr; for (int n=0; ndestroy(procneigh_levels); delete [] world_levels; @@ -919,6 +890,50 @@ void MSM::deallocate_levels() delete [] v3grid; delete [] v4grid; delete [] v5grid; + + world_levels = nullptr; + active_flag = nullptr; + cg = nullptr; + cg_peratom = nullptr; + + alpha = nullptr; + betax = nullptr; + betay = nullptr; + betaz = nullptr; + + nx_msm = nullptr; + ny_msm = nullptr; + nz_msm = nullptr; + + nxlo_in = nullptr; + nylo_in = nullptr; + nzlo_in = nullptr; + + nxhi_in = nullptr; + nyhi_in = nullptr; + nzhi_in = nullptr; + + nxlo_out = nullptr; + nylo_out = nullptr; + nzlo_out = nullptr; + + nxhi_out = nullptr; + nyhi_out = nullptr; + nzhi_out = nullptr; + + delxinv = nullptr; + delyinv = nullptr; + delzinv = nullptr; + + qgrid = nullptr; + egrid = nullptr; + + v0grid = nullptr; + v1grid = nullptr; + v2grid = nullptr; + v3grid = nullptr; + v4grid = nullptr; + v5grid = nullptr; } /* ---------------------------------------------------------------------- @@ -1053,9 +1068,9 @@ void MSM::set_grid_global() double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); *p_cutoff = cutoff; - char str[128]; - sprintf(str,"Adjusting Coulombic cutoff for MSM, new cutoff = %g",cutoff); - if (me == 0) error->warning(FLERR,str); + if (me == 0) + error->warning(FLERR,fmt::format("Adjusting Coulombic cutoff for " + "MSM, new cutoff = {}", cutoff)); } if (triclinic == 0) { @@ -1318,8 +1333,8 @@ void MSM::set_grid_local() void MSM::set_proc_grid(int n) { - for (int i=0; i<3; i++) - myloc[i] = comm->myloc[i]; + for (int i=0; i<3; i++) + myloc[i] = comm->myloc[i]; // size of inner MSM grid owned by this proc @@ -1362,6 +1377,7 @@ void MSM::set_proc_grid(int n) // define a new MPI communicator for this grid level that only includes active procs + if(world_levels[n] != MPI_COMM_NULL) MPI_Comm_free(&world_levels[n]); MPI_Comm_split(world,color,me,&world_levels[n]); if (!active_flag[n]) return; @@ -1479,16 +1495,13 @@ void MSM::particle_map() void MSM::make_rho() { - //fprintf(screen,"MSM aninterpolation\n\n"); - int i,l,m,n,nx,ny,nz,mx,my,mz; double dx,dy,dz,x0,y0,z0; // clear 3d density array - double ***qgridn = qgrid[0]; - - memset(&(qgridn[nzlo_out[0]][nylo_out[0]][nxlo_out[0]]),0,ngrid[0]*sizeof(double)); + double ***qgrid0 = qgrid[0]; + memset(&(qgrid0[nzlo_out[0]][nylo_out[0]][nxlo_out[0]]),0,ngrid[0]*sizeof(double)); // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -1519,7 +1532,7 @@ void MSM::make_rho() x0 = y0*phi1d[1][m]; for (l = nlower; l <= nupper; l++) { mx = l+nx; - qgridn[mz][my][mx] += x0*phi1d[0][l]; + qgrid0[mz][my][mx] += x0*phi1d[0][l]; } } } @@ -1534,8 +1547,6 @@ void MSM::make_rho() void MSM::direct(int n) { - //fprintf(screen,"Direct contribution on level %i\n\n",n); - double ***qgridn = qgrid[n]; double ***egridn = egrid[n]; double ***v0gridn = v0grid[n]; @@ -1768,8 +1779,6 @@ void MSM::direct(int n) void MSM::direct_peratom(int n) { - //fprintf(screen,"Direct contribution on level %i\n\n",n); - double ***qgridn = qgrid[n]; double ***v0gridn = v0grid[n]; double ***v1gridn = v1grid[n]; @@ -1892,8 +1901,6 @@ void MSM::direct_peratom(int n) void MSM::direct_top(int n) { - //fprintf(screen,"Direct contribution on level %i\n\n",n); - double ***qgridn = qgrid[n]; double ***egridn = egrid[n]; double ***v0gridn = v0grid[n]; @@ -2257,8 +2264,6 @@ void 
MSM::direct_peratom_top(int n) void MSM::restriction(int n) { - //fprintf(screen,"Restricting from level %i to %i\n\n",n,n+1); - const int p = order-1; double ***qgrid1 = qgrid[n]; @@ -2281,8 +2286,7 @@ void MSM::restriction(int n) // zero out charge on coarser grid - memset(&(qgrid2[nzlo_out[n+1]][nylo_out[n+1]][nxlo_out[n+1]]),0, - ngrid[n+1]*sizeof(double)); + memset(&(qgrid2[nzlo_out[n+1]][nylo_out[n+1]][nxlo_out[n+1]]),0,ngrid[n+1]*sizeof(double)); for (kp = nzlo_in[n+1]; kp <= nzhi_in[n+1]; kp++) for (jp = nylo_in[n+1]; jp <= nyhi_in[n+1]; jp++) @@ -2331,8 +2335,6 @@ void MSM::restriction(int n) void MSM::prolongation(int n) { - //fprintf(screen,"Prolongating from level %i to %i\n\n",n+1,n); - const int p = order-1; double ***egrid1 = egrid[n]; @@ -2721,8 +2723,6 @@ void MSM::unpack_reverse(int flag, double *buf, int nlist, int *list) void MSM::fieldforce() { - //fprintf(screen,"MSM interpolation\n\n"); - double ***egridn = egrid[0]; int i,l,m,n,nx,ny,nz,mx,my,mz;
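Patch 6/6 reinstates the reverted leak fix in a more localized form: freed pointers are reset to nullptr in deallocate()/deallocate_levels(), and per-level communicators are released only when they are not MPI_COMM_NULL, so repeated setup/teardown neither leaks nor double-frees. A minimal standalone sketch of that idiom follows; LevelData and its members are illustrative stand-ins, not the actual MSM class members.

#include <mpi.h>

// Sketch of the "free, then reset" teardown idiom used in patch 6/6.
struct LevelData {
  int levels = 0;
  MPI_Comm *world_levels = nullptr;

  void deallocate_levels() {
    if (world_levels) {
      for (int n = 0; n < levels; n++)
        if (world_levels[n] != MPI_COMM_NULL)   // release only live communicators
          MPI_Comm_free(&world_levels[n]);
      delete[] world_levels;
    }
    world_levels = nullptr;   // mark as freed so a second call is a no-op
  }
};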