diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h
index d6b028640d..a4e15fe5ce 100644
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@@ -30,19 +30,31 @@ PairStyle(snap/kk/host,PairSNAPKokkos<LMPHostType>)
 
 namespace LAMMPS_NS {
 
+// Routines for both the CPU and GPU backend
 template<int NEIGHFLAG, int EVFLAG>
 struct TagPairSNAPComputeForce{};
-
-struct TagPairSNAPBeta{};
 struct TagPairSNAPComputeNeigh{};
+
+// GPU backend only
 struct TagPairSNAPPreUi{};
 struct TagPairSNAPComputeUi{};
-struct TagPairSNAPComputeUiCPU{};
+struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
 struct TagPairSNAPComputeZi{};
+struct TagPairSNAPBeta{};
 struct TagPairSNAPComputeBi{};
-struct TagPairSNAPZeroYi{};
+struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
 struct TagPairSNAPComputeYi{};
+struct TagPairSNAPTransformYi{}; // re-order ylist from AoSoA to AoS
 struct TagPairSNAPComputeFusedDeidrj{};
+
+// CPU backend only
+struct TagPairSNAPPreUiCPU{};
+struct TagPairSNAPComputeUiCPU{};
+struct TagPairSNAPComputeZiCPU{};
+struct TagPairSNAPBetaCPU{};
+struct TagPairSNAPComputeBiCPU{};
+struct TagPairSNAPZeroYiCPU{};
+struct TagPairSNAPComputeYiCPU{};
 struct TagPairSNAPComputeDuidrjCPU{};
 struct TagPairSNAPComputeDeidrjCPU{};
 
@@ -81,6 +93,10 @@ public:
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;
 
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPBetaCPU,const int& ii) const;
+
+  // GPU backend only
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;
 
@@ -88,32 +104,54 @@ public:
   void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
+  void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeZi,const int& ii) const;
+  void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const;
+  void operator() (TagPairSNAPBeta, const int& ii) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::member_type& team) const;
+  void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeYi,const int& ii) const;
+  void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const;
 
+  // CPU backend only
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUiCPU>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYiCPU>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
+
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPBeta,const int& ii) const;
-
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
@@ -175,6 +213,7 @@ inline double dist2(double* x,double* y);
   Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
   Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
   Kokkos::View<F_FLOAT**, DeviceType> d_beta;                // betas for all atoms in list
+  Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> d_beta_pack;          // betas for all atoms in list, GPU
   Kokkos::View<F_FLOAT**, DeviceType> d_bispectrum;          // bispectrum components for all atoms in list
 
   typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index 5dbf8e3a5a..6bedf5554e 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -194,6 +194,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   if (beta_max < inum) {
     beta_max = inum;
     d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",ncoeff,inum);
+    if (!host_flag)
+      d_beta_pack = Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType>("PairSNAPKokkos:beta_pack",32,ncoeff,(inum+32-1)/32);
     d_ninside = Kokkos::View<int*, DeviceType>("PairSNAPKokkos:ninside",inum);
   }
 
@@ -222,32 +224,93 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
       Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
     }
 
-    //PreUi
+    if (host_flag)
     {
-      int vector_length = vector_length_default;
-      int team_size = team_size_default;
-      if (!host_flag)
-        vector_length = 32;
-      check_team_size_for<TagPairSNAPPreUi>(chunk_size,team_size,vector_length);
-      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
-      Kokkos::parallel_for("PreUi",policy_preui,*this);
-    }
+      // Host codepath
 
-    // ComputeUI
-    {
-      int vector_length = vector_length_default;
-      int team_size = team_size_default;
-      if (host_flag) { // CPU
-        // Run a fused calculation of ulist and accumulation into ulisttot using atomics
+      //PreUi
+      {
+        int vector_length = vector_length_default;
+        int team_size = team_size_default;
+        check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size,vector_length);
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
+        Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
+      }
 
+      // ComputeUi
+      {
+        int vector_length = vector_length_default;
+        int team_size = team_size_default;
+        // Fused calculation of ulist and accumulation into ulisttot using atomics
         typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-
         Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
+      }
 
-      } else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics
+      //Compute bispectrum
+      if (quadraticflag || eflag) {
+        //ComputeZi
+        int idxz_max = snaKK.idxz_max;
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZiCPU> policy_zi_cpu(0,chunk_size*idxz_max);
+        Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this);
 
-        vector_length = 32;
-        team_size = 4; // need to cap b/c of shared memory reqs
+        //ComputeBi
+        int vector_length = vector_length_default;
+        int team_size = team_size_default;
+        check_team_size_for<TagPairSNAPComputeBiCPU>(chunk_size,team_size,vector_length);
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(chunk_size,team_size,vector_length);
+        Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
+      }
+
+      //ZeroYi,ComputeYi
+      {
+        int vector_length = vector_length_default;
+        int team_size = team_size_default;
+
+        //Compute beta = dE_i/dB_i for all i in list
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
+        Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);
+
+        //ZeroYi
+        check_team_size_for<TagPairSNAPZeroYiCPU>(chunk_size,team_size,vector_length);
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
+        Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this);
+
+        //ComputeYi
+        int idxz_max = snaKK.idxz_max;
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
+        Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this);
+      } // host flag
+
+      //ComputeDuidrj and Deidrj
+      {
+        int team_size = team_size_default;
+        int vector_length = vector_length_default;
+
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        snaKK.set_dir(-1); // technically doesn't do anything
+        Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
+
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+
+        Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
+      }
+    } else { // GPU
+
+#ifdef KOKKOS_ENABLE_CUDA
+      //PreUi
+      {
+        int vector_length = vector_length_default;
+        int team_size = team_size_default;
+        check_team_size_for<TagPairSNAPPreUi>(chunk_size,team_size,vector_length);
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
+        Kokkos::parallel_for("PreUi",policy_preui,*this);
+      }
+
+      // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot
+      {
+
+        int vector_length = 32;
+        int team_size = 4; // need to cap b/c of shared memory reqs
         check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);
 
         // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
@@ -265,62 +328,54 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
         Kokkos::parallel_for("ComputeUi",policy_ui,*this);
 
+        //Transform data layout of ulisttot to AoSoA, zero ylist
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
+
       }
-    }
 
+      //Compute bispectrum in AoSoA data layout, transform Bi
+      if (quadraticflag || eflag) {
+        //ComputeZi
+        int idxz_max = snaKK.idxz_max;
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
 
-    //Compute bispectrum
-    if (quadraticflag || eflag) {
-      //ComputeZi
-      int idxz_max = snaKK.idxz_max;
-      typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZi> policy_zi(0,chunk_size*idxz_max);
-      Kokkos::parallel_for("ComputeZi",policy_zi,*this);
+        //ComputeBi
+        int idxb_max = snaKK.idxb_max;
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
 
-      //ComputeBi
-      int vector_length = vector_length_default;
-      int team_size = team_size_default;
-      check_team_size_for<TagPairSNAPComputeBi>(chunk_size,team_size,vector_length);
-      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBi> policy_bi(chunk_size,team_size,vector_length);
-      Kokkos::parallel_for("ComputeBi",policy_bi,*this);
-    }
+        //Transform data layout of blist out of AoSoA
+        //We need this b/c `blist` gets used in ComputeForce which doesn't
+        //take advantage of AoSoA (which at best would only be beneficial
+        //on the margins)
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
+      }
 
-    //Compute beta = dE_i/dB_i for all i in list
-    typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
-    Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
+      //ComputeYi in AoSoA data layout, transform to AoS for ComputeFusedDeidrj
+      //Note zeroing `ylist` is fused into `TransformUi`.
+      {
+        //Compute beta = dE_i/dB_i for all i in list
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
+        Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
 
-    //ZeroYi
-    {
-      int vector_length = vector_length_default;
-      int team_size = team_size_default;
-      if (!host_flag)
-        team_size = 128;
-      check_team_size_for<TagPairSNAPZeroYi>(chunk_size,team_size,vector_length);
-      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYi> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
-      Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);
-    }
+        //ComputeYi
+        const int idxz_max = snaKK.idxz_max;
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
 
-    //ComputeYi
-    int idxz_max = snaKK.idxz_max;
-    typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max);
-    Kokkos::parallel_for("ComputeYi",policy_yi,*this);
+        //Transform data layout of ylist out of AoSoA
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);
 
-    //ComputeDuidrj and Deidrj
-    {
-      int team_size = team_size_default;
-      int vector_length = vector_length_default;
-      if (host_flag) { // CPU
+      }
 
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-        snaKK.set_dir(-1); // technically doesn't do anything
-        Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
-
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-
-        Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
-      } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei
-
-        vector_length = 32;
-        team_size = 2; // need to cap b/c of shared memory reqs
+      // Fused ComputeDuidrj, ComputeDeidrj
+      {
+        int vector_length = 32;
+        int team_size = 2; // need to cap b/c of shared memory reqs
         check_team_size_for<TagPairSNAPComputeFusedDeidrj>(chunk_size,team_size,vector_length);
 
         // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry
@@ -341,6 +396,9 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
           Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this);
         }
       }
+
+#endif // KOKKOS_ENABLE_CUDA
+
     }
 
     //ComputeForce
@@ -416,38 +474,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   }
 }
 
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const int& ii) const {
-
-  const int i = d_ilist[ii + chunk_offset];
-  const int itype = type[i];
-  const int ielem = d_map[itype];
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-    d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
-
-  for (int icoeff = 0; icoeff < ncoeff; icoeff++)
-    d_beta(icoeff,ii) = d_coeffi[icoeff+1];
-
-  if (quadraticflag) {
-    int k = ncoeff+1;
-    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-      double bveci = my_sna.blist(icoeff,ii);
-      d_beta(icoeff,ii) += d_coeffi[k]*bveci;
-      k++;
-      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-        double bvecj = my_sna.blist(jcoeff,ii);
-        d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
-        d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
-        k++;
-      }
-    }
-  }
-}
 
 /* ----------------------------------------------------------------------
    allocate all arrays
@@ -520,6 +546,10 @@ void PairSNAPKokkos<DeviceType>::coeff(int narg, char **arg)
   snaKK.init();
 }
 
+/* ----------------------------------------------------------------------
+   Begin routines that are called on both CPU and GPU codepaths
+------------------------------------------------------------------------- */
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -594,6 +624,53 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
   });
 }
 
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const int& ii) const {
+  
+  if (ii >= chunk_size) return; 
+
+  const int iatom_mod = ii % 32;
+  const int iatom_div = ii / 32;
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int itype = type[i];
+  const int ielem = d_map[itype];
+  SNAKokkos<DeviceType> my_sna = snaKK;
+
+  Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+    d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1];
+  }
+
+  if (quadraticflag) {
+    const auto idxb_max = my_sna.idxb_max;
+    int k = ncoeff+1;
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+      const auto idxb = icoeff % idxb_max;
+      const auto idx_chem = icoeff / idxb_max;
+      double bveci = my_sna.blist(idxb, idx_chem, ii);
+      d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci;
+      k++;
+      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+        const auto jdxb = jcoeff % idxb_max;
+        const auto jdx_chem = jcoeff / idxb_max;
+        double bvecj = my_sna.blist(jdxb, jdx_chem, ii);
+        d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj;
+        d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci;
+        k++;
+      }
+    }
+  }
+}
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi>::member_type& team) const {
@@ -627,61 +704,107 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
   SNAKokkos<DeviceType> my_sna = snaKK;
 
-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
+  const int iatom = iatom_mod + iatom_div * 32;
+  if (iatom >= chunk_size) return;
 
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
+  if (idxu >= my_sna.idxu_max) return;
+
+  int elem_count = chemflag ? nelements : 1;
+
+  for (int ielem = 0; ielem < elem_count; ielem++) {
+
+    const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom);
+    const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom);
+
+    my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
+
+    my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.;
+    my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.;
+  }
 
-  my_sna.compute_ui_cpu(team,ii,jj);
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYi>::member_type& team) const {
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
   SNAKokkos<DeviceType> my_sna = snaKK;
 
-  // Extract the quantum number
-  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
-  if (idx >= my_sna.idxu_max) return;
+  const int iatom = iatom_mod + iatom_div * 32;
+  if (iatom >= chunk_size) return;
 
-  // Extract the atomic index
-  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
-  if (ii >= chunk_size) return;
+  if (jjz >= my_sna.idxz_max) return;
 
-  if (chemflag)
-    for(int ielem = 0; ielem < nelements; ielem++)
-      my_sna.zero_yi(idx,ii,ielem);
-  else
-    my_sna.zero_yi(idx,ii,0);
+  my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int& ii) const {
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const {
   SNAKokkos<DeviceType> my_sna = snaKK;
-  my_sna.compute_yi(ii,d_beta);
+
+  const int iatom = iatom_mod + iatom_div * 32;
+  if (iatom >= chunk_size) return;
+
+  if (idxu >= my_sna.idxu_max) return;
+
+  int elem_count = chemflag ? nelements : 1;
+  for (int ielem = 0; ielem < elem_count; ielem++) {
+    const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div);
+    const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div);
+
+    my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im };
+  }
+
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const int& ii) const {
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
   SNAKokkos<DeviceType> my_sna = snaKK;
-  my_sna.compute_zi(ii);
+
+  const int iatom = iatom_mod + iatom_div * 32;
+  if (iatom >= chunk_size) return;
+
+  if (jjz >= my_sna.idxz_max) return;
+
+  my_sna.compute_zi(iatom_mod,jjz,iatom_div);
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBi>::member_type& team) const {
-  int ii = team.league_rank();
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
   SNAKokkos<DeviceType> my_sna = snaKK;
-  my_sna.compute_bi(team,ii);
+
+  const int iatom = iatom_mod + iatom_div * 32;
+  if (iatom >= chunk_size) return;
+
+  if (jjb >= my_sna.idxb_max) return;
+
+  my_sna.compute_bi(iatom_mod,jjb,iatom_div);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
+  SNAKokkos<DeviceType> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * 32;
+  if (iatom >= chunk_size) return;
+
+  if (idxb >= my_sna.idxb_max) return;
+
+  const int ntriples = my_sna.ntriples;
+
+  for (int itriple = 0; itriple < ntriples; itriple++) {
+
+    const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
+
+    my_sna.blist(idxb, itriple, iatom) = blocal;
+  }
+
 }
 
 template<class DeviceType>
@@ -701,6 +824,126 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeFusedDeidrj,const
   my_sna.compute_fused_deidrj(team,ii,jj);
 }
 
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the CPU codepath. These do not take
+   advantage of AoSoA data layouts, but that could be a good point of
+   future optimization and unification with the above kernels. It's unlikely
+   that scratch memory optimizations will ever be useful for the CPU due to
+   different arithmetic intensity requirements for the CPU vs GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBetaCPU,const int& ii) const {
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int itype = type[i];
+  const int ielem = d_map[itype];
+  SNAKokkos<DeviceType> my_sna = snaKK;
+
+  Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+    d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+    d_beta(icoeff,ii) = d_coeffi[icoeff+1];
+
+  if (quadraticflag) {
+    const auto idxb_max = my_sna.idxb_max;
+    int k = ncoeff+1;
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+      const auto idxb = icoeff % idxb_max;
+      const auto idx_chem = icoeff / idxb_max;
+      double bveci = my_sna.blist(idxb,idx_chem,ii);
+      d_beta(icoeff,ii) += d_coeffi[k]*bveci;
+      k++;
+      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+        const auto jdxb = jcoeff % idxb_max;
+        const auto jdx_chem = jcoeff / idxb_max;
+        double bvecj = my_sna.blist(jdxb,jdx_chem,ii);
+        d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
+        d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
+        k++;
+      }
+    }
+  }
+}
+
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU>::member_type& team) const {
+  SNAKokkos<DeviceType> my_sna = snaKK;
+
+  // Extract the atom number
+  const int ii = team.team_rank() + team.team_size() * team.league_rank();
+  if (ii >= chunk_size) return;
+  int itype = type(ii);
+  int ielem = d_map[itype];
+
+  my_sna.pre_ui_cpu(team,ii,ielem);
+}
+
+
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
+  SNAKokkos<DeviceType> my_sna = snaKK;
+
+  // Extract the atom number
+  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
+  if (ii >= chunk_size) return;
+
+  // Extract the neighbor number
+  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
+  const int ninside = d_ninside(ii);
+  if (jj >= ninside) return;
+
+  my_sna.compute_ui_cpu(team,ii,jj);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU>::member_type& team) const {
+  SNAKokkos<DeviceType> my_sna = snaKK;
+
+  // Extract the quantum number
+  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
+  if (idx >= my_sna.idxu_max) return;
+
+  // Extract the atomic index
+  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
+  if (ii >= chunk_size) return;
+
+  if (chemflag)
+    for(int ielem = 0; ielem < nelements; ielem++)
+      my_sna.zero_yi_cpu(idx,ii,ielem);
+  else
+    my_sna.zero_yi_cpu(idx,ii,0);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYiCPU,const int& ii) const {
+  SNAKokkos<DeviceType> my_sna = snaKK;
+  my_sna.compute_yi_cpu(ii,d_beta);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZiCPU,const int& ii) const {
+  SNAKokkos<DeviceType> my_sna = snaKK;
+  my_sna.compute_zi_cpu(ii);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU>::member_type& team) const {
+  int ii = team.league_rank();
+  SNAKokkos<DeviceType> my_sna = snaKK;
+  my_sna.compute_bi_cpu(team,ii);
+}
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
@@ -735,6 +978,12 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const t
   my_sna.compute_deidrj_cpu(team,ii,jj);
 }
 
+/* ----------------------------------------------------------------------
+   Also used for both CPU and GPU codepaths. Could maybe benefit from a
+   separate GPU/CPU codepath, but this kernel takes so little time it's
+   likely not worth it.
+------------------------------------------------------------------------- */
+
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
@@ -799,20 +1048,31 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
 
         // E = beta.B + 0.5*B^t.alpha.B
 
+        const auto idxb_max = snaKK.idxb_max;
+
         // linear contributions
 
-        for (int icoeff = 0; icoeff < ncoeff; icoeff++)
-          evdwl += d_coeffi[icoeff+1]*my_sna.blist(icoeff,ii);
+        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+          const auto idxb = icoeff % idxb_max;
+          const auto idx_chem = icoeff / idxb_max;
+          evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
+        }
 
         // quadratic contributions
 
         if (quadraticflag) {
           int k = ncoeff+1;
           for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bveci = my_sna.blist(icoeff,ii);
+            const auto idxb = icoeff % idxb_max;
+            const auto idx_chem = icoeff / idxb_max;
+            double bveci = my_sna.blist(idxb,idx_chem,ii);
+
             evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
             for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double bvecj = my_sna.blist(jcoeff,ii);
+              auto jdxb = jcoeff % idxb_max;
+              auto jdx_chem = jcoeff / idxb_max;
+              double bvecj = my_sna.blist(jdxb,jdx_chem,ii);
+
               evdwl += d_coeffi[k++]*bveci*bvecj;
             }
           }
diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index 30c705cc41..121f293633 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -36,7 +36,9 @@ public:
   typedef Kokkos::View<double**, DeviceType> t_sna_2d;
   typedef Kokkos::View<double**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll;
   typedef Kokkos::View<double***, DeviceType> t_sna_3d;
+  typedef Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> t_sna_3d_ll;
   typedef Kokkos::View<double***[3], DeviceType> t_sna_4d;
+  typedef Kokkos::View<double****, Kokkos::LayoutLeft, DeviceType> t_sna_4d_ll;
   typedef Kokkos::View<double**[3], DeviceType> t_sna_3d3;
   typedef Kokkos::View<double*****, DeviceType> t_sna_5d;
 
@@ -48,7 +50,8 @@ public:
   typedef Kokkos::View<SNAcomplex***, DeviceType> t_sna_3c;
   typedef Kokkos::View<SNAcomplex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll;
   typedef Kokkos::View<SNAcomplex***[3], DeviceType> t_sna_4c;
-  typedef Kokkos::View<SNAcomplex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
+  typedef Kokkos::View<SNAcomplex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c3_ll;
+  typedef Kokkos::View<SNAcomplex****, Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
   typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
   typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
 
@@ -73,27 +76,39 @@ inline
 
   int ncoeff;
 
-  // functions for bispectrum coefficients
+  // functions for bispectrum coefficients, GPU only
   KOKKOS_INLINE_FUNCTION
-  void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&, int); // ForceSNAP
+  void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
+  void compute_zi(const int&, const int&, const int&);    // ForceSNAP
+  KOKKOS_INLINE_FUNCTION
+  void compute_yi(int,int,int,
+   const Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
+  KOKKOS_INLINE_FUNCTION
+  void compute_bi(const int&, const int&, const int&);    // ForceSNAP
+
+  // functions for bispectrum coefficients, CPU only
+  KOKKOS_INLINE_FUNCTION
+  void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
+  KOKKOS_INLINE_FUNCTION
   void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_zi(const int&);    // ForceSNAP
+  void compute_zi_cpu(const int&);    // ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void zero_yi(const int&, const int&, int); // ForceSNAP
+  void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_yi(int,
+  void compute_yi_cpu(int,
    const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
-  KOKKOS_INLINE_FUNCTION
-  void compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);    // ForceSNAP
-
-  // functions for derivatives
+    KOKKOS_INLINE_FUNCTION
+  void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);    // ForceSNAP
 
+  // functions for derivatives, GPU only
   KOKKOS_INLINE_FUNCTION
   void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); //ForceSNAP
+
+  // functions for derivatives, CPU only
   KOKKOS_INLINE_FUNCTION
   void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
   KOKKOS_INLINE_FUNCTION
@@ -139,18 +154,32 @@ inline
 
   int twojmax, diagonalstyle;
 
-  t_sna_2d_ll blist;
-  t_sna_2c_ll ulisttot;
-  t_sna_2c_ll zlist;
+  t_sna_3d_ll blist;
+  t_sna_3c_ll ulisttot;
+  t_sna_3c_ll zlist;
 
   t_sna_3c_ll ulist;
-  t_sna_2c_ll ylist;
-
+  t_sna_3c_ll ylist;
+  
   // derivatives of data
-  t_sna_4c_ll dulist;
+  t_sna_4c3_ll dulist;
+  
+  // Modified structures for GPU backend
+  t_sna_3d_ll ulisttot_re; // split real,
+  t_sna_3d_ll ulisttot_im; // imag
+  t_sna_4c_ll ulisttot_pack; // AoSoA layout
+  t_sna_4c_ll zlist_pack; // AoSoA layout
+  t_sna_4d_ll blist_pack;
+  t_sna_4d_ll ylist_pack_re; // split real, 
+  t_sna_4d_ll ylist_pack_im; // imag AoSoA layout
 
   int idxcg_max, idxu_max, idxz_max, idxb_max;
 
+  // Chem snap counts
+  int nelements;
+  int ndoubles;
+  int ntriples;
+
 private:
   double rmin0, rfac0;
 
@@ -212,9 +241,6 @@ inline
   // Chem snap flags
   int chem_flag;
   int bnorm_flag;
-  int nelements;
-  int ndoubles;
-  int ntriples;
 
   // Self-weight
   double wself;
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index dae4a0c2a1..866b4c516f 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -237,56 +237,71 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
   element = t_sna_2i("sna:rcutij",natom,nmax);
   dedr = t_sna_3d("sna:dedr",natom,nmax,3);
 
-  blist = t_sna_2d_ll("sna:blist",idxb_max*ntriples,natom);
-  //ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max);
-  ulisttot = t_sna_2c_ll("sna:ulisttot",idxu_max*nelements,natom);
-
-  zlist = t_sna_2c_ll("sna:zlist",idxz_max*ndoubles,natom);
-
-  //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
 #ifdef KOKKOS_ENABLE_CUDA
   if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
     // dummy allocation
+    ulisttot = t_sna_3c_ll("sna:ulisttot",1,1,1);
+    ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",idxu_max,nelements,natom);
+    ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",idxu_max,nelements,natom);
+    ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",32,idxu_max,nelements,(natom+32-1)/32);
     ulist = t_sna_3c_ll("sna:ulist",1,1,1);
-    dulist = t_sna_4c_ll("sna:dulist",1,1,1);
+    zlist = t_sna_3c_ll("sna:zlist",1,1,1);
+    zlist_pack = t_sna_4c_ll("sna:zlist_pack",32,idxz_max,ndoubles,(natom+32-1)/32);
+    blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom);
+    blist_pack = t_sna_4d_ll("sna:blist_pack",32,idxb_max,ntriples,(natom+32-1)/32);
+    ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom);
+    ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",32,idxu_max,nelements,(natom+32-1)/32);
+    ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",32,idxu_max,nelements,(natom+32-1)/32);
+    dulist = t_sna_4c3_ll("sna:dulist",1,1,1);
   } else {
 #endif
+    ulisttot = t_sna_3c_ll("sna:ulisttot",idxu_max,nelements,natom);
+    ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",1,1,1);
+    ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",1,1,1);
+    ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",1,1,1,1);
     ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
-    dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
+    zlist = t_sna_3c_ll("sna:zlist",idxz_max,ndoubles,natom);
+    zlist_pack = t_sna_4c_ll("sna:zlist_pack",1,1,1,1);
+    blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom);
+    blist_pack = t_sna_4d_ll("sna:blist_pack",1,1,1,1);
+    ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom);
+    ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",1,1,1,1);
+    ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",1,1,1,1);
+    dulist = t_sna_4c3_ll("sna:dulist",idxu_max,natom,nmax);
+
 #ifdef KOKKOS_ENABLE_CUDA
   }
 #endif
-
-  //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max);
-  ylist = t_sna_2c_ll("sna:ylist",idxu_max*nelements,natom);
 }
 
 /* ----------------------------------------------------------------------
- *    compute Ui by summing over neighbors j
- *    ------------------------------------------------------------------------- */
+ * GPU routines
+ * ----------------------------------------------------------------------*/
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, int ielem)
+void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
 {
-  for(int jelem = 0; jelem < nelements; jelem++)
-  for (int j = 0; j <= twojmax; j++) {
-    const int jju = idxu_block(j);
+  for (int jelem = 0; jelem < nelements; jelem++) {
+    for (int j = 0; j <= twojmax; j++) {
+      const int jju = idxu_block(j);
 
-    // Only diagonal elements get initialized
-    // for (int m = 0; m < (j+1)*(j+1); m++)
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
-      [&] (const int m) {
+      // Only diagonal elements get initialized
+      // for (int m = 0; m < (j+1)*(j+1); m++)
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
+        [&] (const int m) {
 
-      const int jjup = jju + m;
+        const int jjup = jju + m;
 
-      // if m is on the "diagonal", initialize it with the self energy.
-      // Otherwise zero it out
-      SNAcomplex init = {0., 0.};
-      if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = {wself, 0.0}; } //need to map iatom to element
+        // if m is on the "diagonal", initialize it with the self energy.
+        // Otherwise zero it out
+        double re_part = 0.; 
+        if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; }
 
-      ulisttot(jelem*idxu_max+jjup, iatom) = init;
-    });
+        ulisttot_re(jjup, jelem, iatom) = re_part;
+        ulisttot_im(jjup, jelem, iatom) = 0.;
+      });
+    }
   }
 
 }
@@ -322,7 +337,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
 
   const double wj_local = wj(iatom, jnbor);
   const double rcut = rcutij(iatom, jnbor);
-  const int jpos = element(iatom, jnbor)*idxu_max;
+  const int jelem = element(iatom, jnbor);
 
   const double rsq = x * x + y * y + z * z;
   const double r = sqrt(rsq);
@@ -347,9 +362,8 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
   // All writes go to global memory and shared memory
   // so we can avoid all global memory reads
   Kokkos::single(Kokkos::PerThread(team), [=]() {
-    //ulist(0,iatom,jnbor) = { 1.0, 0.0 };
     buf1[0] = {1.,0.};
-    Kokkos::atomic_add(&(ulisttot(jpos,iatom).re), sfac);
+    Kokkos::atomic_add(&(ulisttot_re(0,jelem,iatom)), sfac);
   });
 
   for (int j = 1; j <= twojmax; j++) {
@@ -358,9 +372,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
 
     // fill in left side of matrix layer from previous layer
 
-    // Flatten loop over ma, mb, need to figure out total
-    // number of iterations
-
+    // Flatten loop over ma, mb
     // for (int ma = 0; ma <= j; ma++)
     const int n_ma = j+1;
     // for (int mb = 0; 2*mb <= j; mb++)
@@ -374,12 +386,12 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
       [&] (const int m) {
 
       // ma fast, mb slow
-      int ma = m % n_ma;
-      int mb = m / n_ma;
+      // Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive.
+      const int mb = m / n_ma;
+      const int ma = m - mb * n_ma;
 
       // index into global memory array
-      const int jju_index = jju+m+jpos;
-      //const int jjup_index = jjup+mb*j+ma;
+      const int jju_index = jju+m;
 
       // index into shared memory buffer for this level
       const int jju_shared_idx = m;
@@ -391,22 +403,23 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
 
       // VMK recursion relation: grab contribution which is multiplied by b*
       const double rootpq2 = -rootpqarray(ma, j - mb);
-      const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
-      //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.);
+      const SNAcomplex u_up2 = rootpq2*((ma > 0)?buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.));
+
+      // u_accum += conj(b) * u_up2
       caconjxpy(b, u_up2, u_accum);
 
       // VMK recursion relation: grab contribution which is multiplied by a*
       const double rootpq1 = rootpqarray(j - ma, j - mb);
-      const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
-      //const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.);
+      const SNAcomplex u_up1 = rootpq1*((ma < j)?buf1[jjup_shared_idx]:SNAcomplex(0.,0.));
+
+      // u_accum += conj(a) * u_up1
       caconjxpy(a, u_up1, u_accum);
 
-      //ulist(jju_index,iatom,jnbor) = u_accum;
       // back up into shared memory for next iter
       buf2[jju_shared_idx] = u_accum;
 
-      Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re);
-      Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im);
+      Kokkos::atomic_add(&(ulisttot_re(jju_index,jelem,iatom)), sfac * u_accum.re);
+      Kokkos::atomic_add(&(ulisttot_im(jju_index,jelem,iatom)), sfac * u_accum.im);
 
       // copy left side to right side with inversion symmetry VMK 4.4(2)
       // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
@@ -415,7 +428,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
       if (m < total_iters - 1 || j % 2 == 1) {
         const int sign_factor = (((ma+mb)%2==0)?1:-1);
         const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
-        const int jjup_flip = jju + jju_shared_flip + jpos; // jju+(j+1-mb)*(j+1)-(ma+1);
+        const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);
 
 
         if (sign_factor == 1) {
@@ -423,11 +436,12 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
         } else {
           u_accum.re = -u_accum.re;
         }
-        //ulist(jjup_flip,iatom,jnbor) = u_accum;
+        
         buf2[jju_shared_flip] = u_accum;
 
-        Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).re), sfac * u_accum.re);
-        Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).im), sfac * u_accum.im);
+        // split re, im to get fully coalesced atomic add
+        Kokkos::atomic_add(&(ulisttot_re(jjup_flip,jelem,iatom)), sfac * u_accum.re);
+        Kokkos::atomic_add(&(ulisttot_im(jjup_flip,jelem,iatom)), sfac * u_accum.im);
       }
     });
     // In CUDA backend,
@@ -442,46 +456,13 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
 }
 
 /* ----------------------------------------------------------------------
-   compute Ui by summing over bispectrum components. CPU only.
+   compute Zi by summing over products of Ui, GPU version
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNAKokkos<DeviceType>::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div)
 {
-  double rsq, r, x, y, z, z0, theta0;
-
-  // utot(j,ma,mb) = 0 for all j,ma,ma
-  // utot(j,ma,ma) = 1 for all j,ma
-  // for j in neighbors of i:
-  //   compute r0 = (x,y,z,z0)
-  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
-
-  x = rij(iatom,jnbor,0);
-  y = rij(iatom,jnbor,1);
-  z = rij(iatom,jnbor,2);
-  rsq = x * x + y * y + z * z;
-  r = sqrt(rsq);
-
-  theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
-  //    theta0 = (r - rmin0) * rscale0;
-  z0 = r / tan(theta0);
-
-  compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r);
-  add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), element(iatom, jnbor));
-
-}
-
-/* ----------------------------------------------------------------------
-   compute Zi by summing over products of Ui
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_zi(const int& iter)
-{
-  const int iatom = iter / idxz_max;
-  const int jjz = iter % idxz_max;
 
   const int j1 = idxz(jjz, 0);
   const int j2 = idxz(jjz, 1);
@@ -493,77 +474,161 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iter)
   const int na = idxz(jjz, 7);
   const int nb = idxz(jjz, 8);
 
-  const double *cgblock = cglist.data() + idxcg_block(j1, j2, j);
+  const double* cgblock = cglist.data() + idxcg_block(j1, j2, j);
 
   int idouble = 0;
-  for(int elem1 = 0; elem1 < nelements; elem1++)
-  for(int elem2 = 0; elem2 < nelements; elem2++) {
 
-  const int jjza = idouble*idxz_max + jjz;
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      double ztmp_r = 0.;
+      double ztmp_i = 0.;
 
-  zlist(jjza, iatom).re = 0.0;
-  zlist(jjza, iatom).im = 0.0;
+      int jju1 = idxu_block[j1] + (j1+1)*mb1min;
+      int jju2 = idxu_block[j2] + (j2+1)*mb2max;
+      int icgb = mb1min*(j2+1) + mb2max;
 
-  int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min;
-  int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max;
-  int icgb = mb1min * (j2 + 1) + mb2max;
-  for (int ib = 0; ib < nb; ib++) {
+      #pragma unroll
+      for(int ib = 0; ib < nb; ib++) {
 
-    double suma1_r = 0.0;
-    double suma1_i = 0.0;
+        int ma1 = ma1min;
+        int ma2 = ma2max;
+        int icga = ma1min*(j2+1) + ma2max;
 
-    int ma1 = ma1min;
-    int ma2 = ma2max;
-    int icga = ma1min * (j2 + 1) + ma2max;
-    for (int ia = 0; ia < na; ia++) {
-      suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).re -
-                                  ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).im);
-      suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).im +
-                                  ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).re);
-      ma1++;
-      ma2--;
-      icga += j2;
-    } // end loop over ia
+        #pragma unroll
+        for(int ia = 0; ia < na; ia++) {
+          const SNAcomplex utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div);
+          const SNAcomplex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div);
+          const auto cgcoeff_a = cgblock[icga];
+          const auto cgcoeff_b = cgblock[icgb];
+          ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im);
+          ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re);
+          ma1++;
+          ma2--;
+          icga += j2;
+        } // end loop over ia
 
-    zlist(jjza, iatom).re += cgblock[icgb] * suma1_r;
-    zlist(jjza, iatom).im += cgblock[icgb] * suma1_i;
+        jju1 += j1 + 1;
+        jju2 -= j2 + 1;
+        icgb += j2;
+      } // end loop over ib
 
-    jju1 += j1 + 1;
-    jju2 -= j2 + 1;
-    icgb += j2;
+      if (bnorm_flag) {
+        ztmp_r /= (j + 1);
+        ztmp_i /= (j + 1);
+      }
 
-  } // end loop over ib
-  if (bnorm_flag) {
-    zlist(jjza, iatom).re /= (j+1);
-    zlist(jjza, iatom).im /= (j+1);
-  }
-  idouble++;
+      zlist_pack(iatom_mod,jjz,idouble,iatom_div) = { ztmp_r, ztmp_i };
+
+      idouble++;
+    }
   }
 }
 
 /* ----------------------------------------------------------------------
-   compute Yi from Ui without storing Zi, looping over zlist indices
+   compute Bi by summing conj(Ui)*Zi
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::zero_yi(const int& idx, const int& iatom, int ielem)
+void SNAKokkos<DeviceType>::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div)
 {
-  ylist(ielem*idxu_max+idx,iatom) = {0.0, 0.0};
+  // for j1 = 0,...,twojmax
+  //   for j2 = 0,twojmax
+  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
+  //        b(j1,j2,j) = 0
+  //        for mb = 0,...,jmid
+  //          for ma = 0,...,j
+  //            b(j1,j2,j) +=
+  //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
+
+  const int j1 = idxb(jjb,0);
+  const int j2 = idxb(jjb,1);
+  const int j = idxb(jjb,2);
+
+  const int jjz = idxz_block(j1,j2,j);
+  const int jju = idxu_block[j];
+
+  int itriple = 0;
+  int idouble = 0;
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      const int jalloy = idouble;
+
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+        const int jjballoy = itriple;
+
+        double sumzu = 0.0;
+        double sumzu_temp = 0.0;
+
+        for(int mb = 0; 2*mb < j; mb++) {
+          for(int ma = 0; ma <= j; ma++) {
+            const int jju_index = jju+mb*(j+1)+ma;
+            const int jjz_index = jjz+mb*(j+1)+ma;
+            if (2*mb == j) return; // I think we can remove this?
+            const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
+            const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div);
+            sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;
+          }
+        }
+        sumzu += sumzu_temp;
+
+        // For j even, special treatment for middle column
+        if (j%2 == 0) {
+          sumzu_temp = 0.;
+
+          const int mb = j/2;
+          for(int ma = 0; ma < mb; ma++) {
+            const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
+            const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
+
+            const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
+            const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div);
+            sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;
+
+          }
+          sumzu += sumzu_temp;
+
+          const int ma = mb;
+          const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
+          const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
+
+          const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
+          const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div);
+          sumzu += 0.5 * (utot.re * zloc.re + utot.im * zloc.im);
+        } // end if jeven
+
+        sumzu *= 2.0;
+        if (bzero_flag) {
+          if (!wselfall_flag) {
+            if (elem1 == elem2 && elem1 == elem3) {
+              sumzu -= bzero[j];
+            }
+          } else {
+            sumzu -= bzero[j];
+          }
+        }
+        blist_pack(iatom_mod, jjb, jjballoy, iatom_div) = sumzu;
+            //} // end loop over j
+          //} // end loop over j1, j2
+        itriple++; 
+      } // end loop over elem3
+      idouble++;
+    } // end loop over elem2
+  } // end loop over elem1
 }
 
+
 /* ----------------------------------------------------------------------
-   compute Yi from Ui without storing Zi, looping over zlist indices
+   compute Yi from Ui without storing Zi, looping over zlist indices,
+   GPU version
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_yi(int iter,
- const Kokkos::View<F_FLOAT**, DeviceType> &beta)
+void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
+ const Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
 {
   double betaj;
-  const int iatom = iter / idxz_max;
-  const int jjz = iter % idxz_max;
 
   const int j1 = idxz(jjz, 0);
   const int j2 = idxz(jjz, 1);
@@ -576,88 +641,84 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,
   const int nb = idxz(jjz, 8);
   const int jju = idxz(jjz, 9);
 
-  const double *cgblock = cglist.data() + idxcg_block(j1, j2, j);
+  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
   //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
   //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
 
-  int itriple;
-  for(int elem1 = 0; elem1 < nelements; elem1++)
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
     for (int elem2 = 0; elem2 < nelements; elem2++) {
 
-    double ztmp_r = 0.0;
-    double ztmp_i = 0.0;
+      double ztmp_r = 0.0;
+      double ztmp_i = 0.0;
 
-    int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min;
-    int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max;
-    int icgb = mb1min * (j2 + 1) + mb2max;
-    for (int ib = 0; ib < nb; ib++) {
+      int jju1 = idxu_block[j1] + (j1 + 1) * mb1min;
+      int jju2 = idxu_block[j2] + (j2 + 1) * mb2max;
+      int icgb = mb1min * (j2 + 1) + mb2max;
 
-      double suma1_r = 0.0;
-      double suma1_i = 0.0;
+      #pragma unroll
+      for (int ib = 0; ib < nb; ib++) {
 
-      int ma1 = ma1min;
-      int ma2 = ma2max;
-      int icga = ma1min * (j2 + 1) + ma2max;
+        int ma1 = ma1min;
+        int ma2 = ma2max;
+        int icga = ma1min*(j2+1) + ma2max;
 
-      for (int ia = 0; ia < na; ia++) {
-        suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re *
-                                    ulisttot(jju2 + ma2, iatom).re -
-                                    ulisttot(jju1 + ma1, iatom).im *
-                                    ulisttot(jju2 + ma2, iatom).im);
-        suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re *
-                                    ulisttot(jju2 + ma2, iatom).im +
-                                    ulisttot(jju1 + ma1, iatom).im *
-                                    ulisttot(jju2 + ma2, iatom).re);
-        ma1++;
-        ma2--;
-        icga += j2;
-      } // end loop over ia
+        #pragma unroll
+        for (int ia = 0; ia < na; ia++) {
+          const SNAcomplex utot1 = ulisttot_pack(iatom_mod,jju1+ma1,elem1,iatom_div);
+          const SNAcomplex utot2 = ulisttot_pack(iatom_mod,jju2+ma2,elem2,iatom_div);
+          const auto cgcoeff_a = cgblock[icga];
+          const auto cgcoeff_b = cgblock[icgb];
+          ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im);
+          ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re);
+          ma1++;
+          ma2--;
+          icga += j2;
+        } // end loop over ia
 
-      ztmp_r += cgblock[icgb] * suma1_r;
-      ztmp_i += cgblock[icgb] * suma1_i;
-      jju1 += j1 + 1;
-      jju2 -= j2 + 1;
-      icgb += j2;
-    } // end loop over ib
+        jju1 += j1 + 1;
+        jju2 -= j2 + 1;
+        icgb += j2;
+      } // end loop over ib
 
-    if (bnorm_flag) {
-      ztmp_i /= j + 1;
-      ztmp_r /= j + 1;
-    }
-
-    // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
-    // find right y_list[jju] and beta(iatom,jjb) entries
-    // multiply and divide by j+1 factors
-    // account for multiplicity of 1, 2, or 3
-
-    // pick out right beta value
-    for (int elem3 = 0; elem3 < nelements; elem3++) {
-      const int jjuy = elem3 * idxu_max + jju;
-      if (j >= j1) {
-        const int jjb = idxb_block(j1, j2, j);
-        itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb;
-        if (j1 == j) {
-          if (j2 == j) betaj = 3 * beta(itriple, iatom);
-          else betaj = 2 * beta(itriple, iatom);
-        } else betaj = beta(itriple, iatom);
-      } else if (j >= j2) {
-        const int jjb = idxb_block(j, j2, j1);
-        itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb;
-        if (j2 == j) betaj = 2 * beta(itriple, iatom);
-        else betaj = beta(itriple, iatom);
-      } else {
-        const int jjb = idxb_block(j2, j, j1);
-        itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb;
-        betaj = beta(itriple, iatom);
+      if (bnorm_flag) {
+        ztmp_r /= j + 1;
+        ztmp_i /= j + 1;
       }
 
-      if (!bnorm_flag && j1 > j)
-        betaj *= (j1 + 1) / (j + 1.0);
+      // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+      // find right y_list[jju] and beta(iatom,jjb) entries
+      // multiply and divide by j+1 factors
+      // account for multiplicity of 1, 2, or 3
 
-      Kokkos::atomic_add(&(ylist(jjuy, iatom).re), betaj * ztmp_r);
-      Kokkos::atomic_add(&(ylist(jjuy, iatom).im), betaj * ztmp_i);
-    }
-  }
+      // pick out right beta value
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+        if (j >= j1) {
+          const int jjb = idxb_block(j1, j2, j);
+          const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb;
+          if (j1 == j) {
+            if (j2 == j) betaj = 3 * beta_pack(iatom_mod, itriple, iatom_div);
+            else betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
+          } else betaj = beta_pack(iatom_mod, itriple, iatom_div);
+        } else if (j >= j2) {
+          const int jjb = idxb_block(j, j2, j1);
+          const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb;
+          if (j2 == j) betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
+          else betaj = beta_pack(iatom_mod, itriple, iatom_div);
+        } else {
+          const int jjb = idxb_block(j2, j, j1);
+          const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb;
+          betaj = beta_pack(iatom_mod, itriple, iatom_div);
+        }
+
+        if (!bnorm_flag && j1 > j)
+          betaj *= (j1 + 1) / (j + 1.0);
+
+
+        Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_r);
+        Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_i);
+      } // end loop over elem3
+    } // end loop over elem2
+  } // end loop over elem1
 }
 
 /* ----------------------------------------------------------------------
@@ -673,7 +734,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
   const int max_m_tile = (twojmax+1)*(twojmax/2+1);
   const int team_rank = team.team_rank();
   const int scratch_shift = team_rank * max_m_tile;
-  const int jpos = element(iatom, jnbor)*idxu_max;
+  const int jelem = element(iatom, jnbor);
 
   // double buffer for ulist
   SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
@@ -723,14 +784,14 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
 
   // Accumulate the full contribution to dedr on the fly
   const double du_prod = dsfac * u; // chain rule
-  const SNAcomplex y_local = ylist(jpos, iatom);
+  const SNAcomplex y_local = ylist(0, jelem, iatom);
 
   // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0
   double dedr_full_sum = 0.5 * du_prod * y_local.re;
 
   // single has a warp barrier at the end
   Kokkos::single(Kokkos::PerThread(team), [=]() {
-    //dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here
+    
     ulist_buf1[0] = {1., 0.};
     dulist_buf1[0] = {0., 0.};
   });
@@ -755,10 +816,11 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
       [&] (const int m, double& sum_tmp) {
 
       // ma fast, mb slow
-      int ma = m % n_ma;
-      int mb = m / n_ma;
+      // Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive.
+      const int mb = m / n_ma;
+      const int ma = m - mb * n_ma;
 
-      const int jju_index = jpos+jju+m;
+      const int jju_index = jju+m;
 
       // Load y_local, apply the symmetry scaling factor
       // The "secret" of the shared memory optimization is it eliminates
@@ -766,7 +828,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
       // shared memory and otherwise always writing, making the kernel
       // ultimately compute bound. We take advantage of that by adding
       // some reads back in.
-      auto y_local = ylist(jju_index,iatom);
+      auto y_local = ylist(jju_index, jelem, iatom);
       if (j % 2 == 0 && 2*mb == j) {
         if (ma == mb) { y_local = 0.5*y_local; }
         else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
@@ -783,29 +845,35 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
       SNAcomplex du_accum = { 0., 0. };
 
       const double rootpq2 = -rootpqarray(ma, j - mb);
-      const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
+      const SNAcomplex u_up2 = rootpq2*((ma > 0) ? ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.));
+
+      // u_accum += conj(b) * u_up2
       caconjxpy(b, u_up2, u_accum);
 
       const double rootpq1 = rootpqarray(j - ma, j - mb);
-      const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
+      const SNAcomplex u_up1 = rootpq1*((ma < j) ? ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.));
+
+      // u_accum += conj(a) * u_up1
       caconjxpy(a, u_up1, u_accum);
 
       // Next, spin up du_accum
-      const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.);
+      const SNAcomplex du_up1 = rootpq1*((ma < j) ? dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.));
+
+      // du_accum += conj(da) * u_up1 + conj(a) * du_up1
       caconjxpy(da, u_up1, du_accum);
       caconjxpy(a, du_up1, du_accum);
 
-      const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.);
+      const SNAcomplex du_up2 = rootpq2*((ma > 0) ? dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.));
+
+      // du_accum += conj(db) * u_up2 + conj(b) * du_up2
       caconjxpy(db, u_up2, du_accum);
       caconjxpy(b, du_up2, du_accum);
 
-      // No need to save u_accum to global memory
       // Cache u_accum, du_accum to scratch memory.
       ulist_buf2[jju_shared_idx] = u_accum;
       dulist_buf2[jju_shared_idx] = du_accum;
 
       // Directly accumulate deidrj into sum_tmp
-      //dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
       const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum);
       sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im;
 
@@ -813,7 +881,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
       // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
       if (j%2==1 && mb+1==n_mb) {
         int sign_factor = (((ma+mb)%2==0)?1:-1);
-        //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist
+        
         const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
 
         if (sign_factor == 1) {
@@ -826,8 +894,6 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
 
         // We don't need the second half of the tile for the deidrj accumulation.
         // That's taken care of by the symmetry factor above.
-        //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
-
         // We do need it for ortho polynomial generation, though
         ulist_buf2[jju_shared_flip] = u_accum;
         dulist_buf2[jju_shared_flip] = du_accum;
@@ -854,69 +920,150 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
   });
 }
 
-/* ----------------------------------------------------------------------
-   compute dEidRj, CPU path only.
-------------------------------------------------------------------------- */
 
 
+/* ----------------------------------------------------------------------
+ * CPU routines
+ * ----------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------
+ *    compute Ui by summing over neighbors j
+ *    ------------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
 {
-  t_scalar3<double> final_sum;
-  const int jelem = element(iatom, jnbor);
+  for (int jelem = 0; jelem < nelements; jelem++) {
+    for (int j = 0; j <= twojmax; j++) {
+      const int jju = idxu_block(j);
 
-  //for(int j = 0; j <= twojmax; j++) {
-  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
-      [&] (const int& j, t_scalar3<double>& sum_tmp) {
-    int jju = idxu_block[j];
-    int jjuy = idxu_block[j] + jelem*idxu_max;
+      // Only diagonal elements get initialized
+      // for (int m = 0; m < (j+1)*(j+1); m++)
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
+        [&] (const int m) {
 
-    for(int mb = 0; 2*mb < j; mb++)
-      for(int ma = 0; ma <= j; ma++) {
-        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im;
-        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im;
-        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im;
-        jju++;
-        jjuy++;
-      } //end loop over ma mb
+        const int jjup = jju + m;
 
-    // For j even, handle middle column
+        // if m is on the "diagonal", initialize it with the self energy.
+        // Otherwise zero it out
+        SNAcomplex init = {0., 0.};
+        if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = {wself, 0.0}; } //need to map iatom to element
 
-    if (j%2 == 0) {
-
-      int mb = j/2;
-      for(int ma = 0; ma < mb; ma++) {
-        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im;
-        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im;
-        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im;
-        jju++;
-        jjuy++;
-      }
-
-      //int ma = mb;
-      sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im)*0.5;
-      sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im)*0.5;
-      sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im)*0.5;
-    } // end if jeven
-
-  },final_sum); // end loop over j
-
-  Kokkos::single(Kokkos::PerThread(team), [&] () {
-    dedr(iatom,jnbor,0) = final_sum.x*2.0;
-    dedr(iatom,jnbor,1) = final_sum.y*2.0;
-    dedr(iatom,jnbor,2) = final_sum.z*2.0;
-  });
+        ulisttot(jjup, jelem, iatom) = init;
+      });
+    }
+  }
 
 }
 
+
 /* ----------------------------------------------------------------------
-   compute Bi by summing conj(Ui)*Zi
+   compute Ui by summing over bispectrum components. CPU only.
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
+void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+{
+  double rsq, r, x, y, z, z0, theta0;
+
+  // utot(j,ma,mb) = 0 for all j,ma,ma
+  // utot(j,ma,ma) = 1 for all j,ma
+  // for j in neighbors of i:
+  //   compute r0 = (x,y,z,z0)
+  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
+
+  x = rij(iatom,jnbor,0);
+  y = rij(iatom,jnbor,1);
+  z = rij(iatom,jnbor,2);
+  rsq = x * x + y * y + z * z;
+  r = sqrt(rsq);
+
+  theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
+  //    theta0 = (r - rmin0) * rscale0;
+  z0 = r / tan(theta0);
+
+  compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r);
+  add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), element(iatom, jnbor));
+
+}
+/* ----------------------------------------------------------------------
+   compute Zi by summing over products of Ui, CPU version
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void SNAKokkos<DeviceType>::compute_zi_cpu(const int& iter)
+{
+  const int iatom = iter / idxz_max;
+  const int jjz = iter % idxz_max;
+
+  const int j1 = idxz(jjz, 0);
+  const int j2 = idxz(jjz, 1);
+  const int j = idxz(jjz, 2);
+  const int ma1min = idxz(jjz, 3);
+  const int ma2max = idxz(jjz, 4);
+  const int mb1min = idxz(jjz, 5);
+  const int mb2max = idxz(jjz, 6);
+  const int na = idxz(jjz, 7);
+  const int nb = idxz(jjz, 8);
+
+  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
+
+  int idouble = 0;
+
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      zlist(jjz, idouble, iatom).re = 0.0;
+      zlist(jjz, idouble, iatom).im = 0.0;
+
+      int jju1 = idxu_block[j1] + (j1+1)*mb1min;
+      int jju2 = idxu_block[j2] + (j2+1)*mb2max;
+      int icgb = mb1min*(j2+1) + mb2max;
+      for(int ib = 0; ib < nb; ib++) {
+
+        double suma1_r = 0.0;
+        double suma1_i = 0.0;
+
+        int ma1 = ma1min;
+        int ma2 = ma2max;
+        int icga = ma1min * (j2 + 1) + ma2max;
+        for(int ia = 0; ia < na; ia++) {
+          suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re -
+                                      ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im);
+          suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im +
+                                      ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re);
+          ma1++;
+          ma2--;
+          icga += j2;
+        } // end loop over ia
+
+        zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r;
+        zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i;
+
+        jju1 += j1 + 1;
+        jju2 -= j2 + 1;
+        icgb += j2;
+      } // end loop over ib
+
+      if (bnorm_flag) {
+        zlist(jjz, idouble, iatom).re /= (j+1);
+        zlist(jjz, idouble, iatom).im /= (j+1);
+      }
+      idouble++;
+    } // end loop over elem2
+  } // end loop over elem1
+}
+
+
+/* ----------------------------------------------------------------------
+   compute Bi by summing conj(Ui)*Zi, CPU version
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
 {
   // for j1 = 0,...,twojmax
   //   for j2 = 0,twojmax
@@ -929,15 +1076,14 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
 
   int itriple = 0;
   int idouble = 0;
-  int jalloy = 0;
-  for (int elem1 = 0; elem1 < nelements; elem1++)
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
     for (int elem2 = 0; elem2 < nelements; elem2++) {
-      jalloy = idouble*idxz_max;
+      const auto jalloy = idouble;
       for (int elem3 = 0; elem3 < nelements; elem3++) {
-        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, idxb_max),
-        [&](const int &jjb) {
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max),
+          [&] (const int& jjb) {
         //for(int jjb = 0; jjb < idxb_max; jjb++) {
-          const int jjballoy = itriple*idxb_max+jjb;
+          const auto jjballoy = itriple;
           const int j1 = idxb(jjb, 0);
           const int j2 = idxb(jjb, 1);
           const int j = idxb(jjb, 2);
@@ -946,76 +1092,191 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
           int jju = idxu_block[j];
           double sumzu = 0.0;
           double sumzu_temp = 0.0;
-          const int bound = (j + 2) / 2;
-          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, (j + 1) * bound),
-          [&](const int mbma, double &sum) {
-            //for(int mb = 0; 2*mb < j; mb++)
-            //for(int ma = 0; ma <= j; ma++) {
-            const int ma = mbma % (j + 1);
-            const int mb = mbma / (j + 1);
-            const int jju_index = elem3 * idxu_max + jju + mb * (j + 1) + ma;
-            const int jjz_index = jalloy + jjz + mb * (j + 1) + ma;
-            if (2 * mb == j) return;
-            sum +=
-                ulisttot(jju_index, iatom).re *
-                zlist(jjz_index, iatom).re +
-                ulisttot(jju_index, iatom).im *
-                zlist(jjz_index, iatom).im;
-          }, sumzu_temp); // end loop over ma, mb
-          sumzu += sumzu_temp;
+          const int bound = (j+2)/2;
+          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound),
+              [&] (const int mbma, double& sum) {
+              //for(int mb = 0; 2*mb < j; mb++)
+                //for(int ma = 0; ma <= j; ma++) {
+              const int ma = mbma % (j + 1);
+              const int mb = mbma / (j + 1);
+              const int jju_index = jju + mb * (j + 1) + ma;
+              const int jjz_index = jjz + mb * (j + 1) + ma;
+              if (2*mb == j) return;
+              sum +=
+                ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
+                ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
+            },sumzu_temp); // end loop over ma, mb
+            sumzu += sumzu_temp;
 
           // For j even, special treatment for middle column
 
-          if (j % 2 == 0) {
-            const int mb = j / 2;
+          if (j%2 == 0) {
+            const int mb = j/2;
             Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb),
-            [&](const int ma, double &sum) {
-              //for(int ma = 0; ma < mb; ma++) {
-              const int jju_index =
-                  elem3 * idxu_max + jju + (mb - 1) * (j + 1) + (j + 1) + ma;
-              const int jjz_index =
-                  jalloy + jjz + (mb - 1) * (j + 1) + (j + 1) + ma;
+                [&] (const int ma, double& sum) {
+            //for(int ma = 0; ma < mb; ma++) {
+              const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
+              const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
               sum +=
-                  ulisttot(jju_index, iatom).re *
-                  zlist(jjz_index, iatom).re +
-                  ulisttot(jju_index, iatom).im *
-                  zlist(jjz_index, iatom).im;
-            }, sumzu_temp); // end loop over ma
+                ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
+                ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im;
+            },sumzu_temp); // end loop over ma
             sumzu += sumzu_temp;
 
             const int ma = mb;
-            const int jju_index = elem3 * idxu_max + jju + (mb - 1) * (j + 1) + (j + 1) + ma;
-            const int jjz_index = jalloy + jjz + (mb - 1) * (j + 1) + (j + 1) + ma;
-            sumzu += 0.5 *
-                     (ulisttot(jju_index, iatom).re *
-                      zlist(jjz_index, iatom).re +
-                      ulisttot(jju_index, iatom).im *
-                      zlist(jjz_index, iatom).im);
+            const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
+            const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
+            sumzu += 0.5*
+              (ulisttot(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
+               ulisttot(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im);
           } // end if jeven
 
-          Kokkos::single(Kokkos::PerThread(team), [&]() {
+          Kokkos::single(Kokkos::PerThread(team), [&] () {
             sumzu *= 2.0;
 
             // apply bzero shift
 
             if (bzero_flag){
               if (!wselfall_flag) {
-                if (elem1 == elem2 && elem1 == elem3)
+                if (elem1 == elem2 && elem1 == elem3) {
                   sumzu -= bzero[j];
-              } else sumzu -= bzero[j];
+                }
+              } else { 
+                sumzu -= bzero[j];
+              }
             }
 
-            blist(jjballoy, iatom) = sumzu;
+            blist(jjb, jjballoy, iatom) = sumzu;
           });
         });
+          //} // end loop over j
+        //} // end loop over j1, j2
         itriple++;
       }
       idouble++;
-    }
-    //} // end loop over j
-  //} // end loop over j1, j2
+    } // end loop over elem2
+  } // end loop over elem1
+  
 }
 
+/* ----------------------------------------------------------------------
+   compute Yi from Ui without storing Zi, looping over zlist indices
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void SNAKokkos<DeviceType>::zero_yi_cpu(const int& idx, const int& iatom, const int& ielem)
+{
+  ylist(idx,ielem,iatom) = {0.0, 0.0};
+}
+
+/* ----------------------------------------------------------------------
+   compute Yi from Ui without storing Zi, looping over zlist indices,
+   CPU version
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
+ const Kokkos::View<F_FLOAT**, DeviceType> &beta)
+{
+  double betaj;
+  const int iatom = iter / idxz_max;
+  const int jjz = iter % idxz_max;
+
+  const int j1 = idxz(jjz, 0);
+  const int j2 = idxz(jjz, 1);
+  const int j = idxz(jjz, 2);
+  const int ma1min = idxz(jjz, 3);
+  const int ma2max = idxz(jjz, 4);
+  const int mb1min = idxz(jjz, 5);
+  const int mb2max = idxz(jjz, 6);
+  const int na = idxz(jjz, 7);
+  const int nb = idxz(jjz, 8);
+  const int jju = idxz(jjz, 9);
+
+  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
+  //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
+  //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
+
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+
+      double ztmp_r = 0.0;
+      double ztmp_i = 0.0;
+
+      int jju1 = idxu_block[j1] + (j1 + 1) * mb1min;
+      int jju2 = idxu_block[j2] + (j2 + 1) * mb2max;
+      int icgb = mb1min * (j2 +1 ) + mb2max;
+
+      for (int ib = 0; ib < nb; ib++) {
+
+        double suma1_r = 0.0;
+        double suma1_i = 0.0;
+
+        int ma1 = ma1min;
+        int ma2 = ma2max;
+        int icga = ma1min*(j2+1) + ma2max;
+
+        for (int ia = 0; ia < na; ia++) {
+          suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re -
+                                      ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im);
+          suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im +
+                                      ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re);
+          ma1++;
+          ma2--;
+          icga += j2;
+        } // end loop over ia
+
+        ztmp_r += cgblock[icgb] * suma1_r;
+        ztmp_i += cgblock[icgb] * suma1_i;
+        jju1 += j1 + 1;
+        jju2 -= j2 + 1;
+        icgb += j2;
+      } // end loop over ib
+
+      if (bnorm_flag) {
+        ztmp_i /= j + 1;
+        ztmp_r /= j + 1;
+      }
+
+      // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+      // find right y_list[jju] and beta(iatom,jjb) entries
+      // multiply and divide by j+1 factors
+      // account for multiplicity of 1, 2, or 3
+
+      // pick out right beta value
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+
+        if (j >= j1) {
+          const int jjb = idxb_block(j1, j2, j);
+          const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb;
+          if (j1 == j) {
+            if (j2 == j) betaj = 3 * beta(itriple, iatom);
+            else betaj = 2 * beta(itriple, iatom);
+          } else betaj = beta(itriple, iatom);
+        } else if (j >= j2) {
+          const int jjb = idxb_block(j, j2, j1);
+          const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb;
+          if (j2 == j) betaj = 2 * beta(itriple, iatom);
+          else betaj = beta(itriple, iatom);
+        } else {
+          const int jjb = idxb_block(j2, j, j1);
+          const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb;
+          betaj = beta(itriple, iatom);
+        }
+
+        if (!bnorm_flag && j1 > j)
+          betaj *= (j1 + 1) / (j + 1.0);
+
+        Kokkos::atomic_add(&(ylist(jju, elem3, iatom).re), betaj*ztmp_r);
+        Kokkos::atomic_add(&(ylist(jju, elem3, iatom).im), betaj*ztmp_i);
+      } // end loop over elem3
+    } // end loop over elem2
+  } // end loop over elem1
+}
+
+
 /* ----------------------------------------------------------------------
    calculate derivative of Ui w.r.t. atom j
 ------------------------------------------------------------------------- */
@@ -1042,6 +1303,61 @@ void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy
   compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor));
 }
 
+
+/* ----------------------------------------------------------------------
+   compute dEidRj, CPU path only.
+------------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+{
+  t_scalar3<double> final_sum;
+  const int jelem = element(iatom, jnbor);
+
+  //for(int j = 0; j <= twojmax; j++) {
+  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
+      [&] (const int& j, t_scalar3<double>& sum_tmp) {
+    int jju = idxu_block[j];
+
+    for(int mb = 0; 2*mb < j; mb++)
+      for(int ma = 0; ma <= j; ma++) {
+        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im;
+        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im;
+        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im;
+        jju++;
+      } //end loop over ma mb
+
+    // For j even, handle middle column
+
+    if (j%2 == 0) {
+
+      int mb = j/2;
+      for(int ma = 0; ma < mb; ma++) {
+        sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im;
+        sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im;
+        sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im;
+        jju++;
+      }
+
+      //int ma = mb;
+      sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im)*0.5;
+      sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im)*0.5;
+      sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im)*0.5;
+    } // end if jeven
+
+  },final_sum); // end loop over j
+
+  Kokkos::single(Kokkos::PerThread(team), [&] () {
+    dedr(iatom,jnbor,0) = final_sum.x*2.0;
+    dedr(iatom,jnbor,1) = final_sum.y*2.0;
+    dedr(iatom,jnbor,2) = final_sum.z*2.0;
+  });
+
+}
+
+
 /* ----------------------------------------------------------------------
    add Wigner U-functions for one neighbor to the total
 ------------------------------------------------------------------------- */
@@ -1052,10 +1368,11 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
                                           double r, double wj, double rcut, int jelem)
 {
   const double sfac = compute_sfac(r, rcut) * wj;
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxu_max),
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(0)),
       [&] (const int& i) {
-    Kokkos::atomic_add(&(ulisttot(jelem*idxu_max+i,iatom).re), sfac * ulist(i,iatom,jnbor).re);
-    Kokkos::atomic_add(&(ulisttot(jelem*idxu_max+i,iatom).im), sfac * ulist(i,iatom,jnbor).im);
+    Kokkos::atomic_add(&(ulisttot(i,jelem,iatom).re), sfac * ulist(i,iatom,jnbor).re);
+    Kokkos::atomic_add(&(ulisttot(i,jelem,iatom).im), sfac * ulist(i,iatom,jnbor).im);
   });
 }
 
@@ -1162,7 +1479,7 @@ void SNAKokkos<DeviceType>::compute_duarray_cpu(const typename Kokkos::TeamPolic
                           double z0, double r, double dz0dr,
                           double wj, double rcut)
 {
-  double r0inv;
+double r0inv;
   double a_r, a_i, b_r, b_i;
   double da_r[3], da_i[3], db_r[3], db_i[3];
   double dz0[3], dr0inv[3], dr0invdr;
@@ -1710,6 +2027,7 @@ double SNAKokkos<DeviceType>::memory_usage()
 {
   int jdimpq = twojmax + 2;
   int jdim = twojmax + 1;
+  int natom_pad = ((natom + 32 - 1) / 32) * 32; // for AoSoA layouts
   double bytes;
 
   bytes = 0;
@@ -1717,21 +2035,38 @@ double SNAKokkos<DeviceType>::memory_usage()
   bytes += jdimpq*jdimpq * sizeof(double);               // pqarray
   bytes += idxcg_max * sizeof(double);                   // cglist
 
+
+
 #ifdef KOKKOS_ENABLE_CUDA
-  if (!std::is_same<DeviceType,Kokkos::Cuda>::value) {
+  if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
+    
+    bytes += natom * idxu_max * nelements * sizeof(double);          // ulisttot_re
+    bytes += natom * idxu_max * nelements * sizeof(double);          // ulisttot_im
+    bytes += natom_pad * idxu_max * nelements * sizeof(double) * 2;  // ulisttot_pack
+
+    bytes += natom_pad * idxz_max * ndoubles * sizeof(double) * 2;   // zlist_pack
+    bytes += natom_pad * idxb_max * ntriples * sizeof(double);       // blist_pack
+
+    bytes += natom_pad * idxu_max * nelements * sizeof(double);      // ylist_pack_re
+    bytes += natom_pad * idxu_max * nelements * sizeof(double);      // ylist_pack_im
+    bytes += natom * idxu_max * nelements * sizeof(double) * 2;      // ylist
+  } else {
 #endif
-    bytes += natom * idxu_max * sizeof(double) * 2;        // ulist
-    bytes += natom * idxu_max * 3 * sizeof(double) * 2;    // dulist
+
+    bytes += natom * nmax * idxu_max * sizeof(double) * 2;           // ulist
+    bytes += natom * idxu_max * nelements * sizeof(double) * 2;      // ulisttot
+
+    bytes += natom * idxz_max * ndoubles * sizeof(double) * 2;       // zlist
+    bytes += natom * idxb_max * ntriples * sizeof(double);           // blist
+
+    bytes += natom * idxu_max * nelements * sizeof(double) * 2;      // ylist
+
+    bytes += natom * nmax * idxu_max * 3 * sizeof(double) * 2;       // dulist
 #ifdef KOKKOS_ENABLE_CUDA
   }
 #endif
-  bytes += natom * idxu_max * nelements * sizeof(double) * 2;        // ulisttot
-  if (!std::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value)
-    bytes += natom * idxu_max * nelements * sizeof(double) * 2;        // ulisttot_lr
 
-  bytes += natom * idxz_max * ndoubles * sizeof(double) * 2;        // zlist
-  bytes += natom * idxb_max * ntriples * sizeof(double);            // blist
-  bytes += natom * idxu_max * nelements * sizeof(double) * 2;        // ylist
+  bytes += natom * nmax * 3 * sizeof(double);            // dedr
 
   bytes += jdim * jdim * jdim * sizeof(int);             // idxcg_block
   bytes += jdim * sizeof(int);                           // idxu_block
@@ -1747,7 +2082,6 @@ double SNAKokkos<DeviceType>::memory_usage()
   bytes += natom * nmax * sizeof(int);                   // inside
   bytes += natom * nmax * sizeof(double);                // wj
   bytes += natom * nmax * sizeof(double);                // rcutij
-  bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist_ij
 
   return bytes;
 }
diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp
index abc8ca1150..e4e5846c9e 100644
--- a/src/KSPACE/msm.cpp
+++ b/src/KSPACE/msm.cpp
@@ -362,6 +362,7 @@ void MSM::setup()
 
   nmax_direct = 8*(nxhi_direct+1)*(nyhi_direct+1)*(nzhi_direct+1);
 
+  deallocate();
   if (peratom_allocate_flag) deallocate_peratom();
 
   // compute direct sum interaction weights
@@ -612,8 +613,6 @@ void MSM::compute(int eflag, int vflag)
 
 void MSM::allocate()
 {
-  deallocate();
-
   // interpolation coeffs
 
   order_allocated = order;
@@ -635,9 +634,9 @@ void MSM::allocate()
   // allocate memory for each grid level
 
   for (int n=0; n<levels; n++) {
-
     memory->create3d_offset(qgrid[n],nzlo_out[n],nzhi_out[n],
             nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:qgrid");
+
     memory->create3d_offset(egrid[n],nzlo_out[n],nzhi_out[n],
             nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:egrid");
 
@@ -660,23 +659,29 @@ void MSM::allocate()
 
 void MSM::deallocate()
 {
-  delete cg_all;
-  cg_all = nullptr;
-
   memory->destroy2d_offset(phi1d,-order_allocated);
   memory->destroy2d_offset(dphi1d,-order_allocated);
 
+  if (cg_all) delete cg_all;
+  cg_all = nullptr;
+
   for (int n=0; n<levels; n++) {
-    memory->destroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
-    memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
+    if (qgrid[n])
+      memory->destroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
 
-    if (world_levels[n] != MPI_COMM_NULL)
-      MPI_Comm_free(&world_levels[n]);
-    world_levels[n] = MPI_COMM_NULL;
-    active_flag[n] = 0;
+    if (egrid[n])
+      memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
 
-    delete cg[n];
-    cg[n] = nullptr;
+    if (world_levels)
+      if (world_levels[n] != MPI_COMM_NULL)
+          MPI_Comm_free(&world_levels[n]);
+
+    if (cg) {
+      if (cg[n]) {
+        delete cg[n];
+        cg[n] = nullptr;
+      }
+    }
   }
 }
 
@@ -765,7 +770,6 @@ void MSM::deallocate_peratom()
 
 void MSM::allocate_levels()
 {
-  deallocate_levels();
   ngrid = new int[levels];
 
   cg = new GridComm*[levels];
@@ -815,21 +819,21 @@ void MSM::allocate_levels()
   v5grid = new double***[levels];
 
   for (int n=0; n<levels; n++) {
-    cg[n] = nullptr;
+    cg[n] = NULL;
     world_levels[n] = MPI_COMM_NULL;
-    active_flag[n] = 0;
-    cg_peratom[n] = nullptr;
+    cg_peratom[n] = NULL;
 
-    qgrid[n] = nullptr;
-    egrid[n] = nullptr;
+    qgrid[n] = NULL;
+    egrid[n] = NULL;
 
-    v0grid[n] = nullptr;
-    v1grid[n] = nullptr;
-    v2grid[n] = nullptr;
-    v3grid[n] = nullptr;
-    v4grid[n] = nullptr;
-    v5grid[n] = nullptr;
+    v0grid[n] = NULL;
+    v1grid[n] = NULL;
+    v2grid[n] = NULL;
+    v3grid[n] = NULL;
+    v4grid[n] = NULL;
+    v5grid[n] = NULL;
   }
+
 }
 
 /* ----------------------------------------------------------------------
@@ -1105,6 +1109,7 @@ void MSM::set_grid_global()
 
   if (!domain->nonperiodic) levels -= 1;
 
+  deallocate_levels();
   allocate_levels();
 
   // find number of grid levels in each direction
diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp
index 6a294d3ccf..5a86caa4fb 100644
--- a/src/atom_vec.cpp
+++ b/src/atom_vec.cpp
@@ -33,6 +33,8 @@ using namespace MathConst;
 #define DELTA 16384
 #define DELTA_BONUS 8192
 
+int AtomVec::num_atom_vecs = 0;
+
 /* ---------------------------------------------------------------------- */
 
 AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp)
@@ -54,6 +56,8 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp)
 
   threads = NULL;
 
+  ++num_atom_vecs;
+
   // peratom variables auto-included in corresponding child style fields string
   // these fields cannot be specified in the fields string
 
@@ -93,44 +97,48 @@ AtomVec::~AtomVec()
   int datatype,cols;
   void *pdata;
 
+  --num_atom_vecs;
+
   for (int i = 0; i < nargcopy; i++) delete [] argcopy[i];
   delete [] argcopy;
 
-  memory->destroy(atom->tag);
-  memory->destroy(atom->type);
-  memory->destroy(atom->mask);
-  memory->destroy(atom->image);
-  memory->destroy(atom->x);
-  memory->destroy(atom->v);
-  memory->destroy(atom->f);
+  if (num_atom_vecs == 0) {
+    memory->destroy(atom->tag);
+    memory->destroy(atom->type);
+    memory->destroy(atom->mask);
+    memory->destroy(atom->image);
+    memory->destroy(atom->x);
+    memory->destroy(atom->v);
+    memory->destroy(atom->f);
 
-  for (int i = 0; i < ngrow; i++) {
-    pdata = mgrow.pdata[i];
-    datatype = mgrow.datatype[i];
-    cols = mgrow.cols[i];
-    if (datatype == Atom::DOUBLE) {
-      if (cols == 0)
-        memory->destroy(*((double **) pdata));
-      else if (cols > 0)
-        memory->destroy(*((double ***) pdata));
-      else {
-        memory->destroy(*((double ***) pdata));
-      }
-    } else if (datatype == Atom::INT) {
-      if (cols == 0)
-        memory->destroy(*((int **) pdata));
-      else if (cols > 0)
-        memory->destroy(*((int ***) pdata));
-      else {
-        memory->destroy(*((int ***) pdata));
-      }
-    } else if (datatype == Atom::BIGINT) {
-      if (cols == 0)
-        memory->destroy(*((bigint **) pdata));
-      else if (cols > 0)
-        memory->destroy(*((bigint ***) pdata));
-      else {
-        memory->destroy(*((bigint ***) pdata));
+    for (int i = 0; i < ngrow; i++) {
+      pdata = mgrow.pdata[i];
+      datatype = mgrow.datatype[i];
+      cols = mgrow.cols[i];
+      if (datatype == Atom::DOUBLE) {
+        if (cols == 0)
+          memory->destroy(*((double **) pdata));
+        else if (cols > 0)
+          memory->destroy(*((double ***) pdata));
+        else {
+          memory->destroy(*((double ***) pdata));
+        }
+      } else if (datatype == Atom::INT) {
+        if (cols == 0)
+          memory->destroy(*((int **) pdata));
+        else if (cols > 0)
+          memory->destroy(*((int ***) pdata));
+        else {
+          memory->destroy(*((int ***) pdata));
+        }
+      } else if (datatype == Atom::BIGINT) {
+        if (cols == 0)
+          memory->destroy(*((bigint **) pdata));
+        else if (cols > 0)
+          memory->destroy(*((bigint ***) pdata));
+        else {
+          memory->destroy(*((bigint ***) pdata));
+        }
       }
     }
   }
diff --git a/src/atom_vec.h b/src/atom_vec.h
index 8ccf922c4b..625307face 100644
--- a/src/atom_vec.h
+++ b/src/atom_vec.h
@@ -209,6 +209,10 @@ class AtomVec : protected Pointers {
 
   bool *threads;
 
+  // counter for atom vec instances
+
+  static int num_atom_vecs;
+
   // local methods
 
   void grow_nmax();