Removed unnecessary team parallelism from CPU routines; rearranged pair_snap_kokkos_impl to make the subsequent CPU/GPU unifications easier to follow.

2024-11-19 12:44:08 -08:00
parent 976167e2e5
commit 277fba1907
4 changed files with 530 additions and 507 deletions
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@ -204,10 +204,10 @@ class PairSNAPKokkos : public PairSNAP {
  void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeighCPU>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUiCPU>::member_type& team) const;
+  void operator() (TagPairSNAPPreUiCPU, const int& iatom) const;

  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
+  void operator() (TagPairSNAPComputeUiCPU, const int& ii) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const;
@ -222,10 +222,10 @@ class PairSNAPKokkos : public PairSNAP {
  void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;

  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
+  void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const;

  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
+  void operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const;

  template<int NEIGHFLAG>
  KOKKOS_INLINE_FUNCTION
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -204,7 +204,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
    const int inum_pad = inum_div * vector_length;
    MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta", inum_pad, ncoeff);
    snaKK.d_beta = d_beta;
-    MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum);
+    MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum_pad);
  }

  chunk_size = MIN(chunksize, inum); // "chunksize" variable is set by user
@ -235,17 +235,14 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,

      //PreUi
      {
-        int team_size = team_size_default;
-        check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size);
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu(0, chunk_size);
        Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
      }

      // ComputeUi
      {
-        int team_size = team_size_default;
        // Fused calculation of ulist and accumulation into ulisttot using atomics
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(0, chunk_size*max_neighs);
        Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
      }

@ -283,13 +280,10 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,

      //ComputeDuidrj and Deidrj
      {
-        int team_size = team_size_default;
-
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(0,chunk_size*max_neighs);
        Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);

-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(0,chunk_size*max_neighs);
        Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
      }

@ -601,47 +595,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::coeff(int narg, char
  snaKK.init();
 }

-/* ----------------------------------------------------------------------
-   Begin routines that are common to both the CPU and GPU codepath.
------------------------------------------------------------------------- */
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBeta, const int& iatom) const {
-
-  if (iatom >= chunk_size) return;
-
-  const int i = d_ilist[iatom + chunk_offset];
-  const int itype = type[i];
-  const int ielem = d_map[itype];
-
-  auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
-
-  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-    d_beta(iatom, icoeff) = d_coeffi[icoeff+1];
-  }
-
-  if (quadraticflag) {
-    const auto idxb_max = snaKK.idxb_max;
-    int k = ncoeff+1;
-    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-      const auto idxb = icoeff % idxb_max;
-      const auto idx_chem = icoeff / idxb_max;
-      real_type bveci = snaKK.blist(iatom, idx_chem, idxb);
-      d_beta(iatom, icoeff) += d_coeffi[k] * bveci;
-      k++;
-      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-        const auto jdxb = jcoeff % idxb_max;
-        const auto jdx_chem = jcoeff / idxb_max;
-        real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb);
-        d_beta(iatom, icoeff) += d_coeffi[k] * bvecj;
-        d_beta(iatom, jcoeff) += d_coeffi[k] * bveci;
-        k++;
-      }
-    }
-  }
-}
-
 /* ----------------------------------------------------------------------
   Begin routines that are unique to the GPU codepath. These take advantage
   of AoSoA data layouts and scratch memory for recursive polynomials
@ -737,222 +690,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
  });
 }

-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  const int ninside = d_ninside(iatom);
-  if (jnbor >= ninside) return;
-
-  snaKK.compute_cayley_klein(iatom,jnbor);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  int itype = type(iatom);
-  int ielem = d_map[itype];
-
-  snaKK.pre_ui(iatom, j, ielem);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
-
-  // extract flattened atom_div / neighbor number / bend location
-  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
-
-  // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
-  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
-  const int jbend = jj_jbend / max_neighs;
-  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
-
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
-    [&] (const int iatom_mod) {
-    const int ii = iatom_mod + vector_length * iatom_div;
-    if (ii >= chunk_size) return;
-
-    const int ninside = d_ninside(ii);
-    if (jj >= ninside) return;
-
-    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
-  });
-
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
-
-  // extract flattened atom_div / neighbor number / bend location
-  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
-
-  // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
-  int jj = flattened_idx - iatom_div * max_neighs;
-
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
-    [&] (const int iatom_mod) {
-    const int ii = iatom_mod + vector_length * iatom_div;
-    if (ii >= chunk_size) return;
-
-    const int ninside = d_ninside(ii);
-    if (jj >= ninside) return;
-
-    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
-  });
-
-}
-
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-  if (idxu > snaKK.idxu_max) return;
-
-  int elem_count = chemflag ? nelements : 1;
-
-  for (int ielem = 0; ielem < elem_count; ielem++) {
-
-    const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
-
-    auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half);
-    auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half);
-
-    if (mapper.flip_sign == 1) {
-      utot_im = -utot_im;
-    } else if (mapper.flip_sign == -1) {
-      utot_re = -utot_re;
-    }
-
-    snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im };
-
-    if (mapper.flip_sign == 0) {
-      snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.;
-      snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.;
-    }
-  }
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  if (jjz >= snaKK.idxz_max) return;
-
-  snaKK.compute_yi(iatom, jjz);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  if (jjz >= snaKK.idxz_max) return;
-
-  snaKK.compute_yi_with_zlist(iatom, jjz);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-  if (jjz >= snaKK.idxz_max) return;
-
-  snaKK.compute_zi(iatom, jjz);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-
-  if (jjb >= snaKK.idxb_max) return;
-
-  snaKK.compute_bi(iatom,jjb);
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-template<int dir>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
-
-  // extract flattened atom_div / neighbor number / bend location
-  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
-
-  // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
-  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
-  const int jbend = jj_jbend / max_neighs;
-  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
-
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
-    [&] (const int iatom_mod) {
-    const int ii = iatom_mod + vector_length * iatom_div;
-    if (ii >= chunk_size) return;
-
-    const int ninside = d_ninside(ii);
-    if (jj >= ninside) return;
-
-    snaKK.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
-
-  });
-
-}
-
-template<class DeviceType, typename real_type, int vector_length>
-template<int dir>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
-
-  // extract flattened atom_div / neighbor number / bend location
-  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
-
-  // extract neighbor index, iatom_div
-  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
-  int jj = flattened_idx - max_neighs * iatom_div;
-
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
-    [&] (const int iatom_mod) {
-    const int ii = iatom_mod + vector_length * iatom_div;
-    if (ii >= chunk_size) return;
-
-    const int ninside = d_ninside(ii);
-    if (jj >= ninside) return;
-
-    snaKK.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
-
-  });
-}
-
-/* ----------------------------------------------------------------------
-   Begin routines that are unique to the CPU codepath. These do not take
-   advantage of AoSoA data layouts, but that could be a good point of
-   future optimization and unification with the above kernels. It's unlikely
-   that scratch memory optimizations will ever be useful for the CPU due to
-   different arithmetic intensity requirements for the CPU vs GPU.
------------------------------------------------------------------------- */
-
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeighCPU>::member_type& team) const {
@ -1028,36 +765,156 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
  });
 }

+/* ----------------------------------------------------------------------
+  Pre-compute the Cayley-Klein parameters for reuse in later routines.
+  GPU only.
+------------------------------------------------------------------------- */

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU>::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {

-  // Extract the atom number
-  const int ii = team.team_rank() + team.team_size() * team.league_rank();
-  if (ii >= chunk_size) return;
-  int itype = type(ii);
-  int ielem = d_map[itype];
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;

-  snaKK.pre_ui_cpu(team,ii,ielem);
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_cayley_klein(iatom, jnbor);
 }

-
+/* ----------------------------------------------------------------------
+  Initialize the "ulisttot" structure with non-zero on-diagonal terms
+  and zero terms elsewhere; both CPU and GPU.
+------------------------------------------------------------------------- */

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;

-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
+  int itype = type(iatom);
+  int ielem = d_map[itype];

-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
+  snaKK.pre_ui(iatom, j, ielem);
+}

-  snaKK.compute_ui_cpu(team,ii,jj);
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU, const int& iatom) const {
+  const int itype = type(iatom);
+  const int ielem = d_map[itype];
+
+  snaKK.pre_ui_cpu(iatom, ielem);
+}
+
+/* ----------------------------------------------------------------------
+  Accumulate the spectral contributions from atom, neighbor pairs into
+  ulisttot_re and _im. These routines are GPU only and use scratch memory
+  staging.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+}
+
+/* ----------------------------------------------------------------------
+  Accumulate the spectral contributions from atom, neighbor pairs into
+  ulisttot_re and _im. This routine is CPU only and does not use staging.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiCPU, const int& ii) const {
+  const int iatom = ii / max_neighs;
+  const int jnbor = ii % max_neighs;
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_ui_cpu(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
+  structure. Zero-initialize ylist. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (idxu > snaKK.idxu_max) return;
+
+  int elem_count = chemflag ? nelements : 1;
+
+  for (int ielem = 0; ielem < elem_count; ielem++) {
+
+    const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
+
+    auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half);
+    auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half);
+
+    if (mapper.flip_sign == 1) {
+      utot_im = -utot_im;
+    } else if (mapper.flip_sign == -1) {
+      utot_re = -utot_re;
+    }
+
+    snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im };
+
+    if (mapper.flip_sign == 0) {
+      snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.;
+      snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.;
+    }
+  }
 }

 template<class DeviceType, typename real_type, int vector_length>
@ -1110,10 +967,21 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
  }
 }

+/* ----------------------------------------------------------------------
+  Compute all elements of the Z tensor and store them into the `zlist`
+  View. This is only used for energy timesteps or quadratic SNAP.
+  CPU and GPU.
+------------------------------------------------------------------------- */
+
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU,const int& ii) const {
-  snaKK.compute_yi_cpu(ii);
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+
+  snaKK.compute_zi(iatom, jjz);
 }

 template<class DeviceType, typename real_type, int vector_length>
@ -1122,6 +990,22 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
  snaKK.compute_zi_cpu(ii);
 }

+/* ----------------------------------------------------------------------
+  Compute the energy triple products and store in the "blist" View.
+  CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjb >= snaKK.idxb_max) return;
+
+  snaKK.compute_bi(iatom,jjb);
+}
+
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBiCPU, const int& ii) const {
@ -1130,42 +1014,187 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
  snaKK.compute_bi(iatom, jjb);
 }

-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const {

-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
-
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
-
-  snaKK.compute_duidrj_cpu(team,ii,jj);
-}
+/* ----------------------------------------------------------------------
+  Assemble the "beta" coefficients that enter the computation of the
+  adjoint matrices Y. For quadratic SNAP, this includes accumulating
+  energy triple products into an "effective" beta that encodes the
+  quadratic terms with otherwise linear compute work.
+  CPU and GPU.
+------------------------------------------------------------------------- */

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBeta, const int& iatom) const {

-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
+  if (iatom >= chunk_size) return;

-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
+  const int i = d_ilist[iatom + chunk_offset];
+  const int itype = type[i];
+  const int ielem = d_map[itype];

-  snaKK.compute_deidrj_cpu(team,ii,jj);
+  auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    d_beta(iatom, icoeff) = d_coeffi[icoeff+1];
+  }
+
+  if (quadraticflag) {
+    const auto idxb_max = snaKK.idxb_max;
+    int k = ncoeff+1;
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+      const auto idxb = icoeff % idxb_max;
+      const auto idx_chem = icoeff / idxb_max;
+      real_type bveci = snaKK.blist(iatom, idx_chem, idxb);
+      d_beta(iatom, icoeff) += d_coeffi[k] * bveci;
+      k++;
+      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+        const auto jdxb = jcoeff % idxb_max;
+        const auto jdx_chem = jcoeff / idxb_max;
+        real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb);
+        d_beta(iatom, icoeff) += d_coeffi[k] * bvecj;
+        d_beta(iatom, jcoeff) += d_coeffi[k] * bveci;
+        k++;
+      }
+    }
+  }
 }

 /* ----------------------------------------------------------------------
-   Also used for both CPU and GPU codepaths. Could maybe benefit from a
-   separate GPU/CPU codepath, but this kernel takes so little time it's
-   likely not worth it.
+  Compute all elements of the Z tensor and accumultate them into the
+  adjoint matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+
+  snaKK.compute_yi(iatom, jjz);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU, const int& ii) const {
+  snaKK.compute_yi_cpu(ii);
+}
+
+/* ----------------------------------------------------------------------
+  Accumulate the pre-computed elements of the Z tensor into the adjoint
+  matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+  if (jjz >= snaKK.idxz_max) return;
+
+  snaKK.compute_yi_with_zlist(iatom, jjz);
+}
+
+/* ----------------------------------------------------------------------
+  Assemble the force contributions for each atom, neighbor pair by
+  contracting the adjoint matrices Y with derivatives of the Wigner
+  matrices U. These routines are GPU only and use scratch memory
+  staging.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+template<int dir>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
+
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+template<int dir>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - max_neighs * iatom_div;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    snaKK.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
+
+  });
+}
+
+/* ----------------------------------------------------------------------
+  Assemble the derivatives of the Winger matrices U into the View
+  "dulist". CPU only.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const {
+  const int iatom = ii / max_neighs;
+  const int jnbor = ii % max_neighs;
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_duidrj_cpu(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+  Assemble the force contributions for each atom, neighbor pair by
+  contracting the adjoint matrices Y with the pre-computed derivatives
+  of the Wigner matrices U. CPU only.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const {
+  const int iatom = ii / max_neighs;
+  const int jnbor = ii % max_neighs;
+  const int ninside = d_ninside(iatom);
+  if (jnbor >= ninside) return;
+
+  snaKK.compute_deidrj_cpu(iatom, jnbor);
+}
+
+/* ----------------------------------------------------------------------
+   This routine formally accumulates the "chunked" force contributions
+   into the broader LAMMPS "f" force View. As appropriate it
+   also accumulates the total energy and the virial. CPU and GPU.
 ------------------------------------------------------------------------- */

 template<class DeviceType, typename real_type, int vector_length>
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@ -237,9 +237,9 @@ class SNAKokkos {

  // functions for bispectrum coefficients, CPU only
  KOKKOS_INLINE_FUNCTION
-  void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&) const; // ForceSNAP
+  void pre_ui_cpu(const int&, const int&) const; // ForceSNAP
  KOKKOS_INLINE_FUNCTION
-  void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; // ForceSNAP
+  void compute_ui_cpu(const int&, const int&) const; // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_zi_cpu(const int&) const;    // ForceSNAP
  KOKKOS_INLINE_FUNCTION
@ -247,9 +247,9 @@ class SNAKokkos {

  // functions for derivatives, CPU only
  KOKKOS_INLINE_FUNCTION
-  void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; //ForceSNAP
+  void compute_duidrj_cpu(const int&, const int&) const; //ForceSNAP
  KOKKOS_INLINE_FUNCTION
-  void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; // ForceSNAP
+  void compute_deidrj_cpu(const int&, const int&) const; // ForceSNAP

  KOKKOS_INLINE_FUNCTION
  real_type compute_sfac(real_type, real_type, real_type, real_type) const; // add_uarraytot, compute_duarray
@ -358,11 +358,10 @@ class SNAKokkos {
  void init_rootpqarray();    // init()

  KOKKOS_INLINE_FUNCTION
-  void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui
+  void add_uarraytot(int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui

  KOKKOS_INLINE_FUNCTION
-  void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                      const real_type&, const real_type&, const real_type&,
+  void compute_uarray_cpu(int, int, const real_type&, const real_type&, const real_type&,
                      const real_type&, const real_type&) const; // compute_ui_cpu


@ -372,8 +371,7 @@ class SNAKokkos {
  inline
  int compute_ncoeff();           // SNAKokkos()
  KOKKOS_INLINE_FUNCTION
-  void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                       const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
+  void compute_duarray_cpu(int, int, const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
                           const real_type&, const real_type&, const real_type&, const real_type&, const real_type&,
                           const real_type&, const real_type&) const;

--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@ -1156,17 +1156,14 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::real_type SNAKokkos<De

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem) const
+void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const int& iatom, const int& ielem) const
 {
  for (int jelem = 0; jelem < nelements; jelem++) {
    for (int j = 0; j <= twojmax; j++) {
      int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug

      // Only diagonal elements get initialized
-      // for (int m = 0; m < (j+1)*(j/2+1); m++)
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
-        [&] (const int m) {
-
+      for (int m = 0; m < (j+1)*(j/2+1); m++) {
        const int jjup = jju + m;

        // if m is on the "diagonal", initialize it with the self energy.
@ -1176,7 +1173,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename

        ulisttot_re(iatom, jelem, jjup) = init;
        ulisttot_im(iatom, jelem, jjup) = 0;
-      });
+      };
    }
  }

@ -1191,7 +1188,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const int& iatom, const int& jnbor) const
 {
  real_type rsq, r, x, y, z, z0, theta0;

@ -1211,8 +1208,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const typen
  //    theta0 = (r - rmin0) * rscale0;
  z0 = r / tan(theta0);

-  compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r);
-  add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor));
+  compute_uarray_cpu(iatom, jnbor, x, y, z, z0, r);
+  add_uarraytot(iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor));

 }
 /* ----------------------------------------------------------------------
@ -1328,7 +1325,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_cpu(int iter) c

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const int& iatom, const int& jnbor) const
 {
  real_type rsq, r, x, y, z, z0, theta0, cs, sn;
  real_type dz0dr;
@ -1345,7 +1342,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const t
  z0 = r * cs / sn;
  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;

-  compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor));
+  compute_duarray_cpu(iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor));
 }


@ -1360,54 +1357,42 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const t

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const int& iatom, const int& jnbor) const
 {
-  t_scalar3<real_type> final_sum;
+  real_type force_sum[3] = { 0, 0, 0 };
  const int jelem = element(iatom, jnbor);

-  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
-      [&] (const int& j, t_scalar3<real_type>& sum_tmp) {
+  for (int j = 0; j <= twojmax; j++) {
    int jju_half = idxu_half_block[j];
    int jju_cache = idxu_cache_block[j];

-    for (int mb = 0; 2*mb < j; mb++)
+    for (int mb = 0; 2 * mb < j; mb++) {
      for (int ma = 0; ma <= j; ma++) {
-        const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
-        sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
-        sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
-        sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
+        complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
+        for (int k = 0; k < 3; k++)
+          force_sum[k] += dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re +
+                          dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im;
        jju_half++; jju_cache++;
-      } //end loop over ma mb
+      }
+    } //end loop over ma mb

    // For j even, handle middle column

-    if (j%2 == 0) {
-
-      int mb = j/2;
-      for (int ma = 0; ma < mb; ma++) {
-        const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
-        sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
-        sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
-        sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
+    if (j % 2 == 0) {
+      //int mb = j / 2;
+      for (int ma = 0; ma <= j; ma++) {
+        complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
+        for (int k = 0; k < 3; k++)
+          force_sum[k] += static_cast<real_type>(0.5) * (dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re +
+                          dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im);
        jju_half++; jju_cache++;
      }
-
-      //int ma = mb;
-      // 0.5 is meant to avoid double-counting
-      const complex y_val = { 0.5 * ylist_re(iatom, jelem, jju_half), 0.5 * ylist_im(iatom, jelem, jju_half) };
-      sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
-      sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
-      sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
    } // end if jeven

-  },final_sum); // end loop over j
-
-  Kokkos::single(Kokkos::PerThread(team), [&] () {
-    dedr(iatom,jnbor,0) = final_sum.x*2.0;
-    dedr(iatom,jnbor,1) = final_sum.y*2.0;
-    dedr(iatom,jnbor,2) = final_sum.z*2.0;
-  });
+  }

+  for (int k = 0; k < 3; k++)
+    dedr(iatom, jnbor, k) = 2 * force_sum[k];
 }


@ -1421,15 +1406,13 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const t

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(int iatom, int jnbor,
    const real_type& r, const real_type& wj, const real_type& rcut,
    const real_type& sinner, const real_type& dinner, int jelem) const
 {
  const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj;

-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
-      [&] (const int& j) {
-
+  for (int j = 0; j <= twojmax; j++) {
    int jju_half = idxu_half_block[j]; // index into ulisttot
    int jju_cache = idxu_cache_block[j]; // index into ulist

@ -1441,7 +1424,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typena
        count++;
      }
    }
-  });
+  }
 }

 /* ----------------------------------------------------------------------
@ -1452,23 +1435,15 @@ void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typena

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(int iatom, int jnbor,
                         const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const
 {
-  real_type r0inv;
-  real_type a_r, b_r, a_i, b_i;
-  real_type rootpq;
-
  // compute Cayley-Klein parameters for unit quaternion
-
-  r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
-  a_r = r0inv * z0;
-  a_i = -r0inv * z;
-  b_r = r0inv * y;
-  b_i = -r0inv * x;
+  real_type r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
+  complex a = { r0inv * z0, -r0inv * z };
+  complex b = { r0inv * y, -r0inv * x };

  // VMK Section 4.8.2
-
  ulist_cpu(iatom, jnbor, 0) = complex::one();

  for (int j = 1; j <= twojmax; j++) {
@ -1476,61 +1451,65 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const t
    int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug

    // fill in left side of matrix layer from previous layer
+    for (int mb = 0; 2*mb <= j; mb++) {
+      int jju_index = jju + (j + 1) * mb;
+      int jjup_index = jjup + j * mb;

-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
-        [&] (const int& mb) {
-    //for (int mb = 0; 2*mb <= j; mb++) {
-      const int jju_index = jju+mb+mb*j;
-      ulist_cpu(iatom, jnbor, jju_index) = { static_cast<real_type>(0), static_cast<real_type>(0) };
+      complex ui = complex::zero();

      for (int ma = 0; ma < j; ma++) {
-        const int jju_index = jju+mb+mb*j+ma;
-        const int jjup_index = jjup+mb*j+ma;
-        rootpq = rootpqarray(j - ma,j - mb);
-        ulist_cpu(iatom, jnbor, jju_index).re +=
-          rootpq *
-          (a_r * ulist_cpu(iatom, jnbor, jjup_index).re +
-           a_i * ulist_cpu(iatom, jnbor, jjup_index).im);
-        ulist_cpu(iatom, jnbor, jju_index).im +=
-          rootpq *
-          (a_r * ulist_cpu(iatom, jnbor, jjup_index).im -
-           a_i * ulist_cpu(iatom, jnbor, jjup_index).re);
+        complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);

-        rootpq = rootpqarray(ma + 1,j - mb);
-        ulist_cpu(iatom, jnbor, jju_index+1).re =
-          -rootpq *
-          (b_r * ulist_cpu(iatom, jnbor, jjup_index).re +
-           b_i * ulist_cpu(iatom, jnbor, jjup_index).im);
-        ulist_cpu(iatom, jnbor, jju_index+1).im =
-          -rootpq *
-          (b_r * ulist_cpu(iatom, jnbor, jjup_index).im -
-           b_i * ulist_cpu(iatom, jnbor, jjup_index).re);
+        real_type rootpq = rootpqarray(j - ma, j - mb);
+        ui.re += rootpq * (a.re * ui_prev.re + a.im * ui_prev.im);
+        ui.im += rootpq * (a.re * ui_prev.im - a.im * ui_prev.re);
+
+        ulist_cpu(iatom, jnbor, jju_index) = ui;
+
+        rootpq = rootpqarray(ma + 1, j - mb);
+        ui.re = -rootpq * (b.re * ui_prev.re + b.im * ui_prev.im);
+        ui.im = -rootpq * (b.re * ui_prev.im - b.im * ui_prev.re);
+
+        jju_index++;
+        jjup_index++;
      }

-      // copy left side to right side with inversion symmetry VMK 4.4(2)
-      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
+      ulist_cpu(iatom, jnbor, jju_index) = ui;
+    }

-      // Only need to add one symmetrized row for convenience
-      // Symmetry gets "unfolded" in accumulating ulisttot
-      if (j%2==1 && mb==(j/2)) {
-        const int mbpar = (mb)%2==0?1:-1;
-        int mapar = mbpar;
-        for (int ma = 0; ma <= j; ma++) {
-          const int jju_index = jju + mb*(j+1) + ma;
-          const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1);
-          if (mapar == 1) {
-            ulist_cpu(iatom, jnbor, jjup_index).re = ulist_cpu(iatom, jnbor, jju_index).re;
-            ulist_cpu(iatom, jnbor, jjup_index).im = -ulist_cpu(iatom, jnbor, jju_index).im;
-          } else {
-            ulist_cpu(iatom, jnbor, jjup_index).re = -ulist_cpu(iatom, jnbor, jju_index).re;
-            ulist_cpu(iatom, jnbor, jjup_index).im = ulist_cpu(iatom, jnbor, jju_index).im;
-          }
-          mapar = -mapar;
-        }
+    // If j is odd (half-integer in the mathematical convention), we need
+    // to add one more row for convenience (for now). This can either be done
+    // via symmetry (see the commented code below), or by the equations to fill
+    // from the left instead of the right
+    if (j % 2 == 1) {
+      int mb = j / 2;
+      // begin filling in the extra row
+      int jju_index = jju + (mb + 1) * (j + 1);
+      int jjup_index = jjup + mb * j;
+
+      complex ui = complex::zero();
+
+      for (int ma = 0; ma < j; ma++) {
+        complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
+
+        real_type rootpq = rootpqarray(j - ma, mb + 1);
+        ui.re += rootpq * (b.re * ui_prev.re - b.im * ui_prev.im);
+        ui.im += rootpq * (b.re * ui_prev.im + b.im * ui_prev.re);
+
+        ulist_cpu(iatom, jnbor, jju_index) = ui;
+
+        rootpq = rootpqarray(ma + 1, mb + 1);
+        ui.re = rootpq * (a.re * ui_prev.re - a.im * ui_prev.im);
+        ui.im = rootpq * (a.re * ui_prev.im + a.im * ui_prev.re);
+
+        jju_index++;
+        jjup_index++;
      }
-    });

+      ulist_cpu(iatom, jnbor, jju_index) = ui;
+    }
  }
+
 }

 /* ----------------------------------------------------------------------
@ -1541,30 +1520,25 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const t

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(int iatom, int jnbor,
                          const real_type& x, const real_type& y, const real_type& z,
                          const real_type& z0, const real_type& r, const real_type& dz0dr,
                          const real_type& wj, const real_type& rcut,
                          const real_type& sinner, const real_type& dinner) const
 {
-  real_type r0inv;
-  real_type a_r, a_i, b_r, b_i;
-  real_type u[3], da_r[3], da_i[3], db_r[3], db_i[3];
-  real_type dz0[3], dr0inv[3], dr0invdr;
-  real_type rootpq;
+  complex da[3], db[3];
+  real_type u[3], dz0[3], dr0inv[3];

  real_type rinv = 1.0 / r;
  u[0] = x * rinv;
  u[1] = y * rinv;
  u[2] = z * rinv;

-  r0inv = 1.0 / sqrt(r * r + z0 * z0);
-  a_r = z0 * r0inv;
-  a_i = -z * r0inv;
-  b_r = y * r0inv;
-  b_i = -x * r0inv;
+  real_type r0inv = 1.0 / sqrt(r * r + z0 * z0);
+  complex a = { z0 * r0inv, -z * r0inv };
+  complex b = { y * r0inv, -x * r0inv };

-  dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
+  real_type dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);

  dr0inv[0] = dr0invdr * u[0];
  dr0inv[1] = dr0invdr * u[1];
@ -1575,19 +1549,19 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
  dz0[2] = dz0dr * u[2];

  for (int k = 0; k < 3; k++) {
-    da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k];
-    da_i[k] = -z * dr0inv[k];
+    da[k].re = dz0[k] * r0inv + z0 * dr0inv[k];
+    da[k].im = -z * dr0inv[k];
  }

-  da_i[2] += -r0inv;
+  da[2].im += -r0inv;

  for (int k = 0; k < 3; k++) {
-    db_r[k] = y * dr0inv[k];
-    db_i[k] = -x * dr0inv[k];
+    db[k].re = y * dr0inv[k];
+    db[k].im = -x * dr0inv[k];
  }

-  db_i[0] += -r0inv;
-  db_r[1] += r0inv;
+  db[0].im += -r0inv;
+  db[1].re += r0inv;

  for (int k = 0; k < 3; k++)
    dulist_cpu(iatom, jnbor, 0, k) = complex::zero();
@ -1595,72 +1569,94 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
  for (int j = 1; j <= twojmax; j++) {
    int jju = idxu_cache_block[j];
    int jjup = idxu_cache_block[j-1];
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
-        [&] (const int& mb) {
-    //for (int mb = 0; 2*mb <= j; mb++) {
-      const int jju_index = jju+mb+mb*j;
-      for (int k = 0; k < 3; k++)
-        dulist_cpu(iatom, jnbor, jju_index, k) = complex::zero();
+
+    for (int mb = 0; 2*mb <= j; mb++) {
+      int jju_index = jju + mb * (j + 1);
+      int jjup_index = jjup + mb * j;
+
+      complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() };

      for (int ma = 0; ma < j; ma++) {
-        const int jju_index = jju+mb+mb*j+ma;
-        const int jjup_index = jjup+mb*j+ma;
-        rootpq = rootpqarray(j - ma,j - mb);
+        complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
+        complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0),
+                                   dulist_cpu(iatom, jnbor, jjup_index, 1),
+                                   dulist_cpu(iatom, jnbor, jjup_index, 2) };
+
+        real_type rootpq = rootpqarray(j - ma,j - mb);
        for (int k = 0; k < 3; k++) {
-          dulist_cpu(iatom, jnbor, jju_index, k).re +=
-            rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
-                      da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im +
-                      a_r * dulist_cpu(iatom, jnbor, jjup_index, k).re +
-                      a_i * dulist_cpu(iatom, jnbor, jjup_index, k).im);
-          dulist_cpu(iatom, jnbor, jju_index, k).im +=
-            rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im -
-                      da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
-                      a_r * dulist_cpu(iatom, jnbor, jjup_index, k).im -
-                      a_i * dulist_cpu(iatom, jnbor, jjup_index, k).re);
+          duidrj[k].re += rootpq * (da[k].re * ui_prev.re + da[k].im * ui_prev.im +
+                                    a.re * duidrj_prev[k].re + a.im * duidrj_prev[k].im);
+
+          duidrj[k].im += rootpq * (da[k].re * ui_prev.im - da[k].im * ui_prev.re +
+                                    a.re * duidrj_prev[k].im - a.im * duidrj_prev[k].re);
+
+          dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
        }

        rootpq = rootpqarray(ma + 1,j - mb);
        for (int k = 0; k < 3; k++) {
-          dulist_cpu(iatom, jnbor, jju_index+1, k).re =
-            -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
-                       db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im +
-                       b_r * dulist_cpu(iatom, jnbor, jjup_index, k).re +
-                       b_i * dulist_cpu(iatom, jnbor, jjup_index, k).im);
-          dulist_cpu(iatom, jnbor, jju_index+1, k).im =
-            -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im -
-                       db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
-                       b_r * dulist_cpu(iatom, jnbor, jjup_index, k).im -
-                       b_i * dulist_cpu(iatom, jnbor, jjup_index, k).re);
+          duidrj[k].re = -rootpq * (db[k].re * ui_prev.re + db[k].im * ui_prev.im +
+                                    b.re * duidrj_prev[k].re + b.im * duidrj_prev[k].im);
+
+          duidrj[k].im = -rootpq * (db[k].re * ui_prev.im - db[k].im * ui_prev.re +
+                                    b.re * duidrj_prev[k].im - b.im * duidrj_prev[k].re);
        }
+
+        jju_index++;
+        jjup_index++;
      }

-      // Only need to add one symmetrized row for convenience
-      // Symmetry gets "unfolded" during the dedr accumulation
-
-      // copy left side to right side with inversion symmetry VMK 4.4(2)
-      // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
-
-      if (j%2==1 && mb==(j/2)) {
-        const int mbpar = (mb)%2==0?1:-1;
-        int mapar = mbpar;
-        for (int ma = 0; ma <= j; ma++) {
-          const int jju_index = jju+mb*(j+1)+ma;
-          const int jjup_index = jju+(mb+2)*(j+1)-(ma+1);
-          if (mapar == 1) {
-            for (int k = 0; k < 3; k++) {
-              dulist_cpu(iatom, jnbor, jjup_index, k).re = dulist_cpu(iatom, jnbor, jju_index, k).re;
-              dulist_cpu(iatom, jnbor, jjup_index, k).im = -dulist_cpu(iatom, jnbor, jju_index, k).im;
-            }
-          } else {
-            for (int k = 0; k < 3; k++) {
-              dulist_cpu(iatom, jnbor, jjup_index, k).re = -dulist_cpu(iatom, jnbor, jju_index, k).re;
-              dulist_cpu(iatom, jnbor, jjup_index, k).im = dulist_cpu(iatom, jnbor, jju_index, k).im;
-            }
-          }
-          mapar = -mapar;
-        }
+      for (int k = 0; k < 3; k++) {
+        dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
      }
-    });
+    }
+
+    // Only need to add one symmetrized row for convenience
+    // Symmetry gets "unfolded" during the dedr accumulation
+
+    // copy left side to right side with inversion symmetry VMK 4.4(2)
+    // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+
+    if (j % 2 == 1) {
+      int mb = j / 2;
+      // begin filling in the extra row
+      int jju_index = jju + (mb + 1) * (j + 1);
+      int jjup_index = jjup + mb * j;
+
+      complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() };
+
+      for (int ma = 0; ma < j; ma++) {
+        complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
+        complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0),
+                                   dulist_cpu(iatom, jnbor, jjup_index, 1),
+                                   dulist_cpu(iatom, jnbor, jjup_index, 2) };
+
+        real_type rootpq = rootpqarray(j - ma, mb + 1);
+        for (int k = 0; k < 3; k++) {
+          duidrj[k].re += rootpq * (db[k].re * ui_prev.re - db[k].im * ui_prev.im +
+                                    b.re * duidrj_prev[k].re - b.im * duidrj_prev[k].im);
+          duidrj[k].im += rootpq * (db[k].re * ui_prev.im + db[k].im * ui_prev.re +
+                                    b.re * duidrj_prev[k].im + b.im * duidrj_prev[k].re);
+
+          dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
+        }
+
+        rootpq = rootpqarray(ma + 1, mb + 1);
+        for (int k = 0; k < 3; k++) {
+          duidrj[k].re = rootpq * (da[k].re * ui_prev.re - da[k].im * ui_prev.im +
+                                   a.re * duidrj_prev[k].re - a.im * duidrj_prev[k].im);
+          duidrj[k].im = rootpq * (da[k].re * ui_prev.im + da[k].im * ui_prev.re +
+                                   a.re * duidrj_prev[k].im + a.im * duidrj_prev[k].re);
+        }
+
+        jju_index++;
+        jjup_index++;
+      }
+
+      for (int k = 0; k < 3; k++) {
+        dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
+      }
+    }
  }

  real_type sfac = compute_sfac(r, rcut, sinner, dinner);
@ -1678,9 +1674,9 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
      for (int ma = 0; ma <= j; ma++) {
        for (int k = 0; k < 3; k++) {
          dulist_cpu(iatom, jnbor, jju, k).re = dsfac * ulist_cpu(iatom, jnbor, jju).re * u[k] +
-                                                sfac * dulist_cpu(iatom, jnbor, jju, k).re;
+                                    sfac * dulist_cpu(iatom, jnbor, jju, k).re;
          dulist_cpu(iatom, jnbor, jju, k).im = dsfac * ulist_cpu(iatom, jnbor, jju).im * u[k] +
-                                                sfac * dulist_cpu(iatom, jnbor, jju, k).im;
+                                    sfac * dulist_cpu(iatom, jnbor, jju, k).im;
        }
        jju++;
      }