diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h
index 7d58f387d6..e7797097c2 100644
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@@ -204,10 +204,10 @@ class PairSNAPKokkos : public PairSNAP {
   void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeighCPU>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUiCPU>::member_type& team) const;
+  void operator() (TagPairSNAPPreUiCPU, const int& iatom) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
+  void operator() (TagPairSNAPComputeUiCPU, const int& ii) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const;
@@ -222,10 +222,10 @@ class PairSNAPKokkos : public PairSNAP {
   void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
+  void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
+  void operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index b2252f85c7..d0647337a8 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -204,7 +204,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
     const int inum_pad = inum_div * vector_length;
     MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta", inum_pad, ncoeff);
     snaKK.d_beta = d_beta;
-    MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum);
+    MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum_pad);
   }
 
   chunk_size = MIN(chunksize, inum); // "chunksize" variable is set by user
@@ -235,17 +235,14 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
 
       //PreUi
       {
-        int team_size = team_size_default;
-        check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size);
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu(0, chunk_size);
         Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
       }
 
       // ComputeUi
       {
-        int team_size = team_size_default;
         // Fused calculation of ulist and accumulation into ulisttot using atomics
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(0, chunk_size*max_neighs);
         Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
       }
 
@@ -283,13 +280,10 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
 
       //ComputeDuidrj and Deidrj
       {
-        int team_size = team_size_default;
-
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(0,chunk_size*max_neighs);
         Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
 
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(0,chunk_size*max_neighs);
         Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
       }
 
@@ -601,47 +595,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::coeff(int narg, char
   snaKK.init();
 }
 
-/* ----------------------------------------------------------------------
-   Begin routines that are common to both the CPU and GPU codepath.
-------------------------------------------------------------------------- */
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBeta, const int& iatom) const {
-
-  if (iatom >= chunk_size) return;
-
-  const int i = d_ilist[iatom + chunk_offset];
-  const int itype = type[i];
-  const int ielem = d_map[itype];
-
-  auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
-
-  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-    d_beta(iatom, icoeff) = d_coeffi[icoeff+1];
-  }
-
-  if (quadraticflag) {
-    const auto idxb_max = snaKK.idxb_max;
-    int k = ncoeff+1;
-    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-      const auto idxb = icoeff % idxb_max;
-      const auto idx_chem = icoeff / idxb_max;
-      real_type bveci = snaKK.blist(iatom, idx_chem, idxb);
-      d_beta(iatom, icoeff) += d_coeffi[k] * bveci;
-      k++;
-      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-        const auto jdxb = jcoeff % idxb_max;
-        const auto jdx_chem = jcoeff / idxb_max;
-        real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb);
-        d_beta(iatom, icoeff) += d_coeffi[k] * bvecj;
-        d_beta(iatom, jcoeff) += d_coeffi[k] * bveci;
-        k++;
-      }
-    }
-  }
-}
-
 /* ----------------------------------------------------------------------
    Begin routines that are unique to the GPU codepath. These take advantage
    of AoSoA data layouts and scratch memory for recursive polynomials
@@ -737,222 +690,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
   });
 }
 
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  const int ninside = d_ninside(iatom);
-  if (jnbor >= ninside) return;
-
-  snaKK.compute_cayley_klein(iatom,jnbor);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  int itype = type(iatom);
-  int ielem = d_map[itype];
-
-  snaKK.pre_ui(iatom, j, ielem);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
-
-  // extract flattened atom_div / neighbor number / bend location
-  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
-
-  // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
-  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
-  const int jbend = jj_jbend / max_neighs;
-  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
-
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
-    [&] (const int iatom_mod) {
-    const int ii = iatom_mod + vector_length * iatom_div;
-    if (ii >= chunk_size) return;
-
-    const int ninside = d_ninside(ii);
-    if (jj >= ninside) return;
-
-    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
-  });
-
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
-
-  // extract flattened atom_div / neighbor number / bend location
-  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
-
-  // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
-  int jj = flattened_idx - iatom_div * max_neighs;
-
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
-    [&] (const int iatom_mod) {
-    const int ii = iatom_mod + vector_length * iatom_div;
-    if (ii >= chunk_size) return;
-
-    const int ninside = d_ninside(ii);
-    if (jj >= ninside) return;
-
-    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
-  });
-
-}
-
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-  if (idxu > snaKK.idxu_max) return;
-
-  int elem_count = chemflag ? nelements : 1;
-
-  for (int ielem = 0; ielem < elem_count; ielem++) {
-
-    const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
-
-    auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half);
-    auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half);
-
-    if (mapper.flip_sign == 1) {
-      utot_im = -utot_im;
-    } else if (mapper.flip_sign == -1) {
-      utot_re = -utot_re;
-    }
-
-    snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im };
-
-    if (mapper.flip_sign == 0) {
-      snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.;
-      snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.;
-    }
-  }
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  if (jjz >= snaKK.idxz_max) return;
-
-  snaKK.compute_yi(iatom, jjz);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  if (jjz >= snaKK.idxz_max) return;
-
-  snaKK.compute_yi_with_zlist(iatom, jjz);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-  if (jjz >= snaKK.idxz_max) return;
-
-  snaKK.compute_zi(iatom, jjz);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  if (jjb >= snaKK.idxb_max) return;
-
-  snaKK.compute_bi(iatom,jjb);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-template<int dir>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
-
-  // extract flattened atom_div / neighbor number / bend location
-  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
-
-  // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
-  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
-  const int jbend = jj_jbend / max_neighs;
-  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
-
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
-    [&] (const int iatom_mod) {
-    const int ii = iatom_mod + vector_length * iatom_div;
-    if (ii >= chunk_size) return;
-
-    const int ninside = d_ninside(ii);
-    if (jj >= ninside) return;
-
-    snaKK.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
-
-  });
-
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-template<int dir>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
-
-  // extract flattened atom_div / neighbor number / bend location
-  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
-
-  // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
-  int jj = flattened_idx - max_neighs * iatom_div;
-
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
-    [&] (const int iatom_mod) {
-    const int ii = iatom_mod + vector_length * iatom_div;
-    if (ii >= chunk_size) return;
-
-    const int ninside = d_ninside(ii);
-    if (jj >= ninside) return;
-
-    snaKK.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
-
-  });
-}
-
-/* ----------------------------------------------------------------------
-   Begin routines that are unique to the CPU codepath. These do not take
-   advantage of AoSoA data layouts, but that could be a good point of
-   future optimization and unification with the above kernels. It's unlikely
-   that scratch memory optimizations will ever be useful for the CPU due to
-   different arithmetic intensity requirements for the CPU vs GPU.
-------------------------------------------------------------------------- */
-
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeighCPU>::member_type& team) const {
@@ -1028,36 +765,156 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
   });
 }
 
+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines.
+  GPU only.
+------------------------------------------------------------------------- */
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU>::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
 
-  // Extract the atom number
-  const int ii = team.team_rank() + team.team_size() * team.league_rank();
-  if (ii >= chunk_size) return;
-  int itype = type(ii);
-  int ielem = d_map[itype];
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
 
-  snaKK.pre_ui_cpu(team,ii,ielem);
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_cayley_klein(iatom, jnbor);
 }
 
-
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere; both CPU and GPU.
+------------------------------------------------------------------------- */
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
 
-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
+  int itype = type(iatom);
+  int ielem = d_map[itype];
 
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
+  snaKK.pre_ui(iatom, j, ielem);
+}
 
-  snaKK.compute_ui_cpu(team,ii,jj);
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU, const int& iatom) const {
+  const int itype = type(iatom);
+  const int ielem = d_map[itype];
+
+  snaKK.pre_ui_cpu(iatom, ielem);
+}
+
+/* ----------------------------------------------------------------------
+  Accumulate the spectral contributions from atom, neighbor pairs into
+  ulisttot_re and _im. These routines are GPU only and use scratch memory
+  staging.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+/* ----------------------------------------------------------------------
+  Accumulate the spectral contributions from atom, neighbor pairs into
+  ulisttot_re and _im. This routine is CPU only and does not use staging.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiCPU, const int& ii) const {
+  const int iatom = ii / max_neighs;
+  const int jnbor = ii % max_neighs;
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_ui_cpu(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (idxu > snaKK.idxu_max) return;
+
+  int elem_count = chemflag ? nelements : 1;
+
+  for (int ielem = 0; ielem < elem_count; ielem++) {
+
+    const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
+
+    auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half);
+    auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half);
+
+    if (mapper.flip_sign == 1) {
+      utot_im = -utot_im;
+    } else if (mapper.flip_sign == -1) {
+      utot_re = -utot_re;
+    }
+
+    snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im };
+
+    if (mapper.flip_sign == 0) {
+      snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.;
+      snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.;
+    }
+  }
 }
 
 template<class DeviceType, typename real_type, int vector_length>
@@ -1110,10 +967,21 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
   }
 }
 
+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+  View. This is only used for energy timesteps or quadratic SNAP.
+  CPU and GPU.
+------------------------------------------------------------------------- */
+
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU,const int& ii) const {
-  snaKK.compute_yi_cpu(ii);
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+
+  snaKK.compute_zi(iatom, jjz);
 }
 
 template<class DeviceType, typename real_type, int vector_length>
@@ -1122,6 +990,22 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
   snaKK.compute_zi_cpu(ii);
 }
 
+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" View.
+  CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjb >= snaKK.idxb_max) return;
+
+  snaKK.compute_bi(iatom,jjb);
+}
+
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBiCPU, const int& ii) const {
@@ -1130,42 +1014,187 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
   snaKK.compute_bi(iatom, jjb);
 }
 
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
 
-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
-
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
-
-  snaKK.compute_duidrj_cpu(team,ii,jj);
-}
+/* ----------------------------------------------------------------------
+  Assemble the "beta" coefficients that enter the computation of the
+  adjoint matrices Y. For quadratic SNAP, this includes accumulating
+  energy triple products into an "effective" beta that encodes the
+  quadratic terms with otherwise linear compute work.
+  CPU and GPU.
+------------------------------------------------------------------------- */
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBeta, const int& iatom) const {
 
-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
+  if (iatom >= chunk_size) return;
 
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
+  const int i = d_ilist[iatom + chunk_offset];
+  const int itype = type[i];
+  const int ielem = d_map[itype];
 
-  snaKK.compute_deidrj_cpu(team,ii,jj);
+  auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    d_beta(iatom, icoeff) = d_coeffi[icoeff+1];
+  }
+
+  if (quadraticflag) {
+    const auto idxb_max = snaKK.idxb_max;
+    int k = ncoeff+1;
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+      const auto idxb = icoeff % idxb_max;
+      const auto idx_chem = icoeff / idxb_max;
+      real_type bveci = snaKK.blist(iatom, idx_chem, idxb);
+      d_beta(iatom, icoeff) += d_coeffi[k] * bveci;
+      k++;
+      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+        const auto jdxb = jcoeff % idxb_max;
+        const auto jdx_chem = jcoeff / idxb_max;
+        real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb);
+        d_beta(iatom, icoeff) += d_coeffi[k] * bvecj;
+        d_beta(iatom, jcoeff) += d_coeffi[k] * bveci;
+        k++;
+      }
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
-   Also used for both CPU and GPU codepaths. Could maybe benefit from a
-   separate GPU/CPU codepath, but this kernel takes so little time it's
-   likely not worth it.
+  Compute all elements of the Z tensor and accumultate them into the
+  adjoint matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+
+  snaKK.compute_yi(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU, const int& ii) const {
+  snaKK.compute_yi_cpu(ii);
+}
+
+/* ----------------------------------------------------------------------
+  Accumulate the pre-computed elements of the Z tensor into the adjoint
+  matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+
+  snaKK.compute_yi_with_zlist(iatom, jjz);
+}
+
+/* ----------------------------------------------------------------------
+  Assemble the force contributions for each atom, neighbor pair by
+  contracting the adjoint matrices Y with derivatives of the Wigner
+  matrices U. These routines are GPU only and use scratch memory
+  staging.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<int dir>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
+
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<int dir>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - max_neighs * iatom_div;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
+
+  });
+}
+
+/* ----------------------------------------------------------------------
+  Assemble the derivatives of the Winger matrices U into the View
+  "dulist". CPU only.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const {
+  const int iatom = ii / max_neighs;
+  const int jnbor = ii % max_neighs;
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_duidrj_cpu(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Assemble the force contributions for each atom, neighbor pair by
+  contracting the adjoint matrices Y with the pre-computed derivatives
+  of the Wigner matrices U. CPU only.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const {
+  const int iatom = ii / max_neighs;
+  const int jnbor = ii % max_neighs;
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_deidrj_cpu(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+   This routine formally accumulates the "chunked" force contributions
+   into the broader LAMMPS "f" force View. As appropriate it
+   also accumulates the total energy and the virial. CPU and GPU.
 ------------------------------------------------------------------------- */
 
 template<class DeviceType, typename real_type, int vector_length>
diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index fa187c26ea..4c9cace26a 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -237,9 +237,9 @@ class SNAKokkos {
 
   // functions for bispectrum coefficients, CPU only
   KOKKOS_INLINE_FUNCTION
-  void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&) const; // ForceSNAP
+  void pre_ui_cpu(const int&, const int&) const; // ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; // ForceSNAP
+  void compute_ui_cpu(const int&, const int&) const; // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_zi_cpu(const int&) const;    // ForceSNAP
   KOKKOS_INLINE_FUNCTION
@@ -247,9 +247,9 @@ class SNAKokkos {
 
   // functions for derivatives, CPU only
   KOKKOS_INLINE_FUNCTION
-  void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; //ForceSNAP
+  void compute_duidrj_cpu(const int&, const int&) const; //ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; // ForceSNAP
+  void compute_deidrj_cpu(const int&, const int&) const; // ForceSNAP
 
   KOKKOS_INLINE_FUNCTION
   real_type compute_sfac(real_type, real_type, real_type, real_type) const; // add_uarraytot, compute_duarray
@@ -358,11 +358,10 @@ class SNAKokkos {
   void init_rootpqarray();    // init()
 
   KOKKOS_INLINE_FUNCTION
-  void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui
+  void add_uarraytot(int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui
 
   KOKKOS_INLINE_FUNCTION
-  void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                      const real_type&, const real_type&, const real_type&,
+  void compute_uarray_cpu(int, int, const real_type&, const real_type&, const real_type&,
                       const real_type&, const real_type&) const; // compute_ui_cpu
 
 
@@ -372,8 +371,7 @@ class SNAKokkos {
   inline
   int compute_ncoeff();           // SNAKokkos()
   KOKKOS_INLINE_FUNCTION
-  void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                       const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
+  void compute_duarray_cpu(int, int, const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
                            const real_type&, const real_type&, const real_type&, const real_type&, const real_type&,
                            const real_type&, const real_type&) const;
 
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 7988026dd2..6cb83b3910 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -1156,17 +1156,14 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::real_type SNAKokkos<De
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem) const
+void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const int& iatom, const int& ielem) const
 {
   for (int jelem = 0; jelem < nelements; jelem++) {
     for (int j = 0; j <= twojmax; j++) {
       int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug
 
       // Only diagonal elements get initialized
-      // for (int m = 0; m < (j+1)*(j/2+1); m++)
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
-        [&] (const int m) {
-
+      for (int m = 0; m < (j+1)*(j/2+1); m++) {
         const int jjup = jju + m;
 
         // if m is on the "diagonal", initialize it with the self energy.
@@ -1176,7 +1173,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename
 
         ulisttot_re(iatom, jelem, jjup) = init;
         ulisttot_im(iatom, jelem, jjup) = 0;
-      });
+      };
     }
   }
 
@@ -1191,7 +1188,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const int& iatom, const int& jnbor) const
 {
   real_type rsq, r, x, y, z, z0, theta0;
 
@@ -1211,8 +1208,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const typen
   //    theta0 = (r - rmin0) * rscale0;
   z0 = r / tan(theta0);
 
-  compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r);
-  add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor));
+  compute_uarray_cpu(iatom, jnbor, x, y, z, z0, r);
+  add_uarraytot(iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor));
 
 }
 /* ----------------------------------------------------------------------
@@ -1328,7 +1325,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_cpu(int iter) c
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const int& iatom, const int& jnbor) const
 {
   real_type rsq, r, x, y, z, z0, theta0, cs, sn;
   real_type dz0dr;
@@ -1345,7 +1342,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const t
   z0 = r * cs / sn;
   dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
 
-  compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor));
+  compute_duarray_cpu(iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor));
 }
 
 
@@ -1360,54 +1357,42 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const t
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const int& iatom, const int& jnbor) const
 {
-  t_scalar3<real_type> final_sum;
+  real_type force_sum[3] = { 0, 0, 0 };
   const int jelem = element(iatom, jnbor);
 
-  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
-      [&] (const int& j, t_scalar3<real_type>& sum_tmp) {
+  for (int j = 0; j <= twojmax; j++) {
     int jju_half = idxu_half_block[j];
     int jju_cache = idxu_cache_block[j];
 
-    for (int mb = 0; 2*mb < j; mb++)
+    for (int mb = 0; 2 * mb < j; mb++) {
       for (int ma = 0; ma <= j; ma++) {
-        const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
-        sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
-        sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
-        sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
+        complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
+        for (int k = 0; k < 3; k++)
+          force_sum[k] += dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re +
+                          dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im;
         jju_half++; jju_cache++;
-      } //end loop over ma mb
+      }
+    } //end loop over ma mb
 
     // For j even, handle middle column
 
-    if (j%2 == 0) {
-
-      int mb = j/2;
-      for (int ma = 0; ma < mb; ma++) {
-        const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
-        sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
-        sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
-        sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
+    if (j % 2 == 0) {
+      //int mb = j / 2;
+      for (int ma = 0; ma <= j; ma++) {
+        complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
+        for (int k = 0; k < 3; k++)
+          force_sum[k] += static_cast<real_type>(0.5) * (dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re +
+                          dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im);
         jju_half++; jju_cache++;
       }
-
-      //int ma = mb;
-      // 0.5 is meant to avoid double-counting
-      const complex y_val = { 0.5 * ylist_re(iatom, jelem, jju_half), 0.5 * ylist_im(iatom, jelem, jju_half) };
-      sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
-      sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
-      sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
     } // end if jeven
 
-  },final_sum); // end loop over j
-
-  Kokkos::single(Kokkos::PerThread(team), [&] () {
-    dedr(iatom,jnbor,0) = final_sum.x*2.0;
-    dedr(iatom,jnbor,1) = final_sum.y*2.0;
-    dedr(iatom,jnbor,2) = final_sum.z*2.0;
-  });
+  }
 
+  for (int k = 0; k < 3; k++)
+    dedr(iatom, jnbor, k) = 2 * force_sum[k];
 }
 
 
@@ -1421,15 +1406,13 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const t
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(int iatom, int jnbor,
     const real_type& r, const real_type& wj, const real_type& rcut,
     const real_type& sinner, const real_type& dinner, int jelem) const
 {
   const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj;
 
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
-      [&] (const int& j) {
-
+  for (int j = 0; j <= twojmax; j++) {
     int jju_half = idxu_half_block[j]; // index into ulisttot
     int jju_cache = idxu_cache_block[j]; // index into ulist
 
@@ -1441,7 +1424,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typena
         count++;
       }
     }
-  });
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1452,23 +1435,15 @@ void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typena
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(int iatom, int jnbor,
                          const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const
 {
-  real_type r0inv;
-  real_type a_r, b_r, a_i, b_i;
-  real_type rootpq;
-
   // compute Cayley-Klein parameters for unit quaternion
-
-  r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
-  a_r = r0inv * z0;
-  a_i = -r0inv * z;
-  b_r = r0inv * y;
-  b_i = -r0inv * x;
+  real_type r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
+  complex a = { r0inv * z0, -r0inv * z };
+  complex b = { r0inv * y, -r0inv * x };
 
   // VMK Section 4.8.2
-
   ulist_cpu(iatom, jnbor, 0) = complex::one();
 
   for (int j = 1; j <= twojmax; j++) {
@@ -1476,61 +1451,65 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const t
     int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug
 
     // fill in left side of matrix layer from previous layer
+    for (int mb = 0; 2*mb <= j; mb++) {
+      int jju_index = jju + (j + 1) * mb;
+      int jjup_index = jjup + j * mb;
 
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
-        [&] (const int& mb) {
-    //for (int mb = 0; 2*mb <= j; mb++) {
-      const int jju_index = jju+mb+mb*j;
-      ulist_cpu(iatom, jnbor, jju_index) = { static_cast<real_type>(0), static_cast<real_type>(0) };
+      complex ui = complex::zero();
 
       for (int ma = 0; ma < j; ma++) {
-        const int jju_index = jju+mb+mb*j+ma;
-        const int jjup_index = jjup+mb*j+ma;
-        rootpq = rootpqarray(j - ma,j - mb);
-        ulist_cpu(iatom, jnbor, jju_index).re +=
-          rootpq *
-          (a_r * ulist_cpu(iatom, jnbor, jjup_index).re +
-           a_i * ulist_cpu(iatom, jnbor, jjup_index).im);
-        ulist_cpu(iatom, jnbor, jju_index).im +=
-          rootpq *
-          (a_r * ulist_cpu(iatom, jnbor, jjup_index).im -
-           a_i * ulist_cpu(iatom, jnbor, jjup_index).re);
+        complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
 
-        rootpq = rootpqarray(ma + 1,j - mb);
-        ulist_cpu(iatom, jnbor, jju_index+1).re =
-          -rootpq *
-          (b_r * ulist_cpu(iatom, jnbor, jjup_index).re +
-           b_i * ulist_cpu(iatom, jnbor, jjup_index).im);
-        ulist_cpu(iatom, jnbor, jju_index+1).im =
-          -rootpq *
-          (b_r * ulist_cpu(iatom, jnbor, jjup_index).im -
-           b_i * ulist_cpu(iatom, jnbor, jjup_index).re);
+        real_type rootpq = rootpqarray(j - ma, j - mb);
+        ui.re += rootpq * (a.re * ui_prev.re + a.im * ui_prev.im);
+        ui.im += rootpq * (a.re * ui_prev.im - a.im * ui_prev.re);
+
+        ulist_cpu(iatom, jnbor, jju_index) = ui;
+
+        rootpq = rootpqarray(ma + 1, j - mb);
+        ui.re = -rootpq * (b.re * ui_prev.re + b.im * ui_prev.im);
+        ui.im = -rootpq * (b.re * ui_prev.im - b.im * ui_prev.re);
+
+        jju_index++;
+        jjup_index++;
       }
 
-      // copy left side to right side with inversion symmetry VMK 4.4(2)
-      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
+      ulist_cpu(iatom, jnbor, jju_index) = ui;
+    }
 
-      // Only need to add one symmetrized row for convenience
-      // Symmetry gets "unfolded" in accumulating ulisttot
-      if (j%2==1 && mb==(j/2)) {
-        const int mbpar = (mb)%2==0?1:-1;
-        int mapar = mbpar;
-        for (int ma = 0; ma <= j; ma++) {
-          const int jju_index = jju + mb*(j+1) + ma;
-          const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1);
-          if (mapar == 1) {
-            ulist_cpu(iatom, jnbor, jjup_index).re = ulist_cpu(iatom, jnbor, jju_index).re;
-            ulist_cpu(iatom, jnbor, jjup_index).im = -ulist_cpu(iatom, jnbor, jju_index).im;
-          } else {
-            ulist_cpu(iatom, jnbor, jjup_index).re = -ulist_cpu(iatom, jnbor, jju_index).re;
-            ulist_cpu(iatom, jnbor, jjup_index).im = ulist_cpu(iatom, jnbor, jju_index).im;
-          }
-          mapar = -mapar;
-        }
+    // If j is odd (half-integer in the mathematical convention), we need
+    // to add one more row for convenience (for now). This can either be done
+    // via symmetry (see the commented code below), or by the equations to fill
+    // from the left instead of the right
+    if (j % 2 == 1) {
+      int mb = j / 2;
+      // begin filling in the extra row
+      int jju_index = jju + (mb + 1) * (j + 1);
+      int jjup_index = jjup + mb * j;
+
+      complex ui = complex::zero();
+
+      for (int ma = 0; ma < j; ma++) {
+        complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
+
+        real_type rootpq = rootpqarray(j - ma, mb + 1);
+        ui.re += rootpq * (b.re * ui_prev.re - b.im * ui_prev.im);
+        ui.im += rootpq * (b.re * ui_prev.im + b.im * ui_prev.re);
+
+        ulist_cpu(iatom, jnbor, jju_index) = ui;
+
+        rootpq = rootpqarray(ma + 1, mb + 1);
+        ui.re = rootpq * (a.re * ui_prev.re - a.im * ui_prev.im);
+        ui.im = rootpq * (a.re * ui_prev.im + a.im * ui_prev.re);
+
+        jju_index++;
+        jjup_index++;
       }
-    });
 
+      ulist_cpu(iatom, jnbor, jju_index) = ui;
+    }
   }
+
 }
 
 /* ----------------------------------------------------------------------
@@ -1541,30 +1520,25 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const t
 
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(int iatom, int jnbor,
                           const real_type& x, const real_type& y, const real_type& z,
                           const real_type& z0, const real_type& r, const real_type& dz0dr,
                           const real_type& wj, const real_type& rcut,
                           const real_type& sinner, const real_type& dinner) const
 {
-  real_type r0inv;
-  real_type a_r, a_i, b_r, b_i;
-  real_type u[3], da_r[3], da_i[3], db_r[3], db_i[3];
-  real_type dz0[3], dr0inv[3], dr0invdr;
-  real_type rootpq;
+  complex da[3], db[3];
+  real_type u[3], dz0[3], dr0inv[3];
 
   real_type rinv = 1.0 / r;
   u[0] = x * rinv;
   u[1] = y * rinv;
   u[2] = z * rinv;
 
-  r0inv = 1.0 / sqrt(r * r + z0 * z0);
-  a_r = z0 * r0inv;
-  a_i = -z * r0inv;
-  b_r = y * r0inv;
-  b_i = -x * r0inv;
+  real_type r0inv = 1.0 / sqrt(r * r + z0 * z0);
+  complex a = { z0 * r0inv, -z * r0inv };
+  complex b = { y * r0inv, -x * r0inv };
 
-  dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
+  real_type dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
 
   dr0inv[0] = dr0invdr * u[0];
   dr0inv[1] = dr0invdr * u[1];
@@ -1575,19 +1549,19 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
   dz0[2] = dz0dr * u[2];
 
   for (int k = 0; k < 3; k++) {
-    da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k];
-    da_i[k] = -z * dr0inv[k];
+    da[k].re = dz0[k] * r0inv + z0 * dr0inv[k];
+    da[k].im = -z * dr0inv[k];
   }
 
-  da_i[2] += -r0inv;
+  da[2].im += -r0inv;
 
   for (int k = 0; k < 3; k++) {
-    db_r[k] = y * dr0inv[k];
-    db_i[k] = -x * dr0inv[k];
+    db[k].re = y * dr0inv[k];
+    db[k].im = -x * dr0inv[k];
   }
 
-  db_i[0] += -r0inv;
-  db_r[1] += r0inv;
+  db[0].im += -r0inv;
+  db[1].re += r0inv;
 
   for (int k = 0; k < 3; k++)
     dulist_cpu(iatom, jnbor, 0, k) = complex::zero();
@@ -1595,72 +1569,94 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
   for (int j = 1; j <= twojmax; j++) {
     int jju = idxu_cache_block[j];
     int jjup = idxu_cache_block[j-1];
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
-        [&] (const int& mb) {
-    //for (int mb = 0; 2*mb <= j; mb++) {
-      const int jju_index = jju+mb+mb*j;
-      for (int k = 0; k < 3; k++)
-        dulist_cpu(iatom, jnbor, jju_index, k) = complex::zero();
+
+    for (int mb = 0; 2*mb <= j; mb++) {
+      int jju_index = jju + mb * (j + 1);
+      int jjup_index = jjup + mb * j;
+
+      complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() };
 
       for (int ma = 0; ma < j; ma++) {
-        const int jju_index = jju+mb+mb*j+ma;
-        const int jjup_index = jjup+mb*j+ma;
-        rootpq = rootpqarray(j - ma,j - mb);
+        complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
+        complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0),
+                                   dulist_cpu(iatom, jnbor, jjup_index, 1),
+                                   dulist_cpu(iatom, jnbor, jjup_index, 2) };
+
+        real_type rootpq = rootpqarray(j - ma,j - mb);
         for (int k = 0; k < 3; k++) {
-          dulist_cpu(iatom, jnbor, jju_index, k).re +=
-            rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
-                      da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im +
-                      a_r * dulist_cpu(iatom, jnbor, jjup_index, k).re +
-                      a_i * dulist_cpu(iatom, jnbor, jjup_index, k).im);
-          dulist_cpu(iatom, jnbor, jju_index, k).im +=
-            rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im -
-                      da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
-                      a_r * dulist_cpu(iatom, jnbor, jjup_index, k).im -
-                      a_i * dulist_cpu(iatom, jnbor, jjup_index, k).re);
+          duidrj[k].re += rootpq * (da[k].re * ui_prev.re + da[k].im * ui_prev.im +
+                                    a.re * duidrj_prev[k].re + a.im * duidrj_prev[k].im);
+
+          duidrj[k].im += rootpq * (da[k].re * ui_prev.im - da[k].im * ui_prev.re +
+                                    a.re * duidrj_prev[k].im - a.im * duidrj_prev[k].re);
+
+          dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
         }
 
         rootpq = rootpqarray(ma + 1,j - mb);
         for (int k = 0; k < 3; k++) {
-          dulist_cpu(iatom, jnbor, jju_index+1, k).re =
-            -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
-                       db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im +
-                       b_r * dulist_cpu(iatom, jnbor, jjup_index, k).re +
-                       b_i * dulist_cpu(iatom, jnbor, jjup_index, k).im);
-          dulist_cpu(iatom, jnbor, jju_index+1, k).im =
-            -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im -
-                       db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
-                       b_r * dulist_cpu(iatom, jnbor, jjup_index, k).im -
-                       b_i * dulist_cpu(iatom, jnbor, jjup_index, k).re);
+          duidrj[k].re = -rootpq * (db[k].re * ui_prev.re + db[k].im * ui_prev.im +
+                                    b.re * duidrj_prev[k].re + b.im * duidrj_prev[k].im);
+
+          duidrj[k].im = -rootpq * (db[k].re * ui_prev.im - db[k].im * ui_prev.re +
+                                    b.re * duidrj_prev[k].im - b.im * duidrj_prev[k].re);
         }
+
+        jju_index++;
+        jjup_index++;
       }
 
-      // Only need to add one symmetrized row for convenience
-      // Symmetry gets "unfolded" during the dedr accumulation
-
-      // copy left side to right side with inversion symmetry VMK 4.4(2)
-      // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
-
-      if (j%2==1 && mb==(j/2)) {
-        const int mbpar = (mb)%2==0?1:-1;
-        int mapar = mbpar;
-        for (int ma = 0; ma <= j; ma++) {
-          const int jju_index = jju+mb*(j+1)+ma;
-          const int jjup_index = jju+(mb+2)*(j+1)-(ma+1);
-          if (mapar == 1) {
-            for (int k = 0; k < 3; k++) {
-              dulist_cpu(iatom, jnbor, jjup_index, k).re = dulist_cpu(iatom, jnbor, jju_index, k).re;
-              dulist_cpu(iatom, jnbor, jjup_index, k).im = -dulist_cpu(iatom, jnbor, jju_index, k).im;
-            }
-          } else {
-            for (int k = 0; k < 3; k++) {
-              dulist_cpu(iatom, jnbor, jjup_index, k).re = -dulist_cpu(iatom, jnbor, jju_index, k).re;
-              dulist_cpu(iatom, jnbor, jjup_index, k).im = dulist_cpu(iatom, jnbor, jju_index, k).im;
-            }
-          }
-          mapar = -mapar;
-        }
+      for (int k = 0; k < 3; k++) {
+        dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
       }
-    });
+    }
+
+    // Only need to add one symmetrized row for convenience
+    // Symmetry gets "unfolded" during the dedr accumulation
+
+    // copy left side to right side with inversion symmetry VMK 4.4(2)
+    // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+
+    if (j % 2 == 1) {
+      int mb = j / 2;
+      // begin filling in the extra row
+      int jju_index = jju + (mb + 1) * (j + 1);
+      int jjup_index = jjup + mb * j;
+
+      complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() };
+
+      for (int ma = 0; ma < j; ma++) {
+        complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
+        complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0),
+                                   dulist_cpu(iatom, jnbor, jjup_index, 1),
+                                   dulist_cpu(iatom, jnbor, jjup_index, 2) };
+
+        real_type rootpq = rootpqarray(j - ma, mb + 1);
+        for (int k = 0; k < 3; k++) {
+          duidrj[k].re += rootpq * (db[k].re * ui_prev.re - db[k].im * ui_prev.im +
+                                    b.re * duidrj_prev[k].re - b.im * duidrj_prev[k].im);
+          duidrj[k].im += rootpq * (db[k].re * ui_prev.im + db[k].im * ui_prev.re +
+                                    b.re * duidrj_prev[k].im + b.im * duidrj_prev[k].re);
+
+          dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
+        }
+
+        rootpq = rootpqarray(ma + 1, mb + 1);
+        for (int k = 0; k < 3; k++) {
+          duidrj[k].re = rootpq * (da[k].re * ui_prev.re - da[k].im * ui_prev.im +
+                                   a.re * duidrj_prev[k].re - a.im * duidrj_prev[k].im);
+          duidrj[k].im = rootpq * (da[k].re * ui_prev.im + da[k].im * ui_prev.re +
+                                   a.re * duidrj_prev[k].im + a.im * duidrj_prev[k].re);
+        }
+
+        jju_index++;
+        jjup_index++;
+      }
+
+      for (int k = 0; k < 3; k++) {
+        dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
+      }
+    }
   }
 
   real_type sfac = compute_sfac(r, rcut, sinner, dinner);
@@ -1678,9 +1674,9 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
       for (int ma = 0; ma <= j; ma++) {
         for (int k = 0; k < 3; k++) {
           dulist_cpu(iatom, jnbor, jju, k).re = dsfac * ulist_cpu(iatom, jnbor, jju).re * u[k] +
-                                                sfac * dulist_cpu(iatom, jnbor, jju, k).re;
+                                    sfac * dulist_cpu(iatom, jnbor, jju, k).re;
           dulist_cpu(iatom, jnbor, jju, k).im = dsfac * ulist_cpu(iatom, jnbor, jju).im * u[k] +
-                                                sfac * dulist_cpu(iatom, jnbor, jju, k).im;
+                                    sfac * dulist_cpu(iatom, jnbor, jju, k).im;
         }
         jju++;
       }