Added less-parallelism-higher-perf paths to recursive polynomial routines. Fixed qSNAP memory coalescing issue. Various modularity improvements.

2021-08-26 22:55:07 -04:00
parent a587f67423
commit a686ce33c6
4 changed files with 642 additions and 458 deletions
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@ -44,7 +44,8 @@ struct TagPairSNAPComputeForce{};
 struct TagPairSNAPComputeNeigh{};
 struct TagPairSNAPComputeCayleyKlein{};
 struct TagPairSNAPPreUi{};
-struct TagPairSNAPComputeUi{};
+struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
+struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
 struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
 struct TagPairSNAPComputeZi{};
 struct TagPairSNAPBeta{};
@ -53,7 +54,9 @@ struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
 struct TagPairSNAPComputeYi{};
 struct TagPairSNAPComputeYiWithZlist{};
 template<int dir>
-struct TagPairSNAPComputeFusedDeidrj{};
+struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
+template<int dir>
+struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence

 // CPU backend only
 struct TagPairSNAPComputeNeighCPU{};
@ -143,7 +146,10 @@ public:
  void operator() (TagPairSNAPPreUi,const int iatom_mod, const int j, const int iatom_div) const;

  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
+  void operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiSmall>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiLarge>::member_type& team) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
@ -168,7 +174,11 @@ public:

  template<int dir>
  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const;
+  void operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const;
+
+  template<int dir>
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const;

  // CPU backend only
  KOKKOS_INLINE_FUNCTION
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -341,18 +341,32 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
      // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot
      {
        // team_size_compute_ui is defined in `pair_snap_kokkos.h`
-
        // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
        const int tile_size = vector_length * (twojmax + 1);
        const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);

-        // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
-        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
-        const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+        if (chunk_size < 16384)
+        {
+          // Version with parallelism over j_bend

-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUi> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
-        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-        Kokkos::parallel_for("ComputeUi",policy_ui,*this);
+          // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
+          const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+          const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+          policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this);
+        } else {
+          // Version w/out parallelism over j_bend
+
+          // total number of teams needed: (natoms / 32) * (max_neighs)
+          const int n_teams = chunk_size_div * max_neighs;
+          const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
+
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
+          policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this);
+        }
      }

      //TransformUi: un-"fold" ulisttot, zero ylist
@ -412,25 +426,51 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
        const int tile_size = vector_length * (twojmax + 1);
        const int scratch_size = scratch_size_helper<complex>(2 * team_size_compute_fused_deidrj * tile_size);

-        // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
-        const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
-        const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
+        if (chunk_size < 16384)
+        {
+          // Version with parallelism over j_bend

-        // x direction
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrj<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
-        policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-        Kokkos::parallel_for("ComputeFusedDeidrj<0>",policy_fused_deidrj_x,*this);
+          // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
+          const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+          const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;

-        // y direction
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrj<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
-        policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-        Kokkos::parallel_for("ComputeFusedDeidrj<1>",policy_fused_deidrj_y,*this);
+          // x direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this);

-        // z direction
-        SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrj<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
-        policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
-        Kokkos::parallel_for("ComputeFusedDeidrj<2>",policy_fused_deidrj_z,*this);
+          // y direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this);

+          // z direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this);
+        } else {
+          // Version w/out parallelism over j_bend
+
+          // total number of teams needed: (natoms / 32) * (max_neighs)
+          const int n_teams = chunk_size_div * max_neighs;
+          const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
+
+          // x direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this);
+
+          // y direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this);
+
+          // z direction
+          SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
+          policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+          Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this);
+
+        }
      }

 #endif // LMP_KOKKOS_GPU
@ -603,13 +643,13 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
      const auto idxb = icoeff % idxb_max;
      const auto idx_chem = icoeff / idxb_max;
-      auto bveci = my_sna.blist(idxb, idx_chem, ii);
+      real_type bveci = my_sna.blist(ii, idx_chem, idxb);
      d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci;
      k++;
      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
        const auto jdxb = jcoeff % idxb_max;
        const auto jdx_chem = jcoeff / idxb_max;
-        real_type bvecj = my_sna.blist(jdxb, jdx_chem, ii);
+        real_type bvecj = my_sna.blist(ii, jdx_chem, jdxb);
        d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj;
        d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci;
        k++;
@ -736,7 +776,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUi>::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;

  // extract flattened atom_div / neighbor number / bend location
@ -756,11 +796,37 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
    const int ninside = d_ninside(ii);
    if (jj >= ninside) return;

-    my_sna.compute_ui(team,iatom_mod, jbend, jj, iatom_div);
+    my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
  });

 }

+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - iatom_div * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
+  });
+
+}
+
+
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
@ -861,9 +927,9 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN

  for (int itriple = 0; itriple < ntriples; itriple++) {

-    const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
+    const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);

-    my_sna.blist(idxb, itriple, iatom) = blocal;
+    my_sna.blist(iatom, itriple, idxb) = blocal;
  }

 }
@ -871,7 +937,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
 template<class DeviceType, typename real_type, int vector_length>
 template<int dir>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;

  // extract flattened atom_div / neighbor number / bend location
@ -891,12 +957,38 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
    const int ninside = d_ninside(ii);
    if (jj >= ninside) return;

-    my_sna.template compute_fused_deidrj<dir>(team, iatom_mod, jbend, jj, iatom_div);
+    my_sna.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);

  });

 }

+template<class DeviceType, typename real_type, int vector_length>
+template<int dir>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
+
+  // extract neighbor index, iatom_div
+  int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
+  int jj = flattened_idx - max_neighs * iatom_div;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    my_sna.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
+
+  });
+}
+
 /* ----------------------------------------------------------------------
   Begin routines that are unique to the CPU codepath. These do not take
   advantage of AoSoA data layouts, but that could be a good point of
@ -925,13 +1017,13 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
      const auto idxb = icoeff % idxb_max;
      const auto idx_chem = icoeff / idxb_max;
-      auto bveci = my_sna.blist(idxb,idx_chem,ii);
+      real_type bveci = my_sna.blist(ii,idx_chem,idxb);
      d_beta(icoeff,ii) += d_coeffi[k]*bveci;
      k++;
      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
        const auto jdxb = jcoeff % idxb_max;
        const auto jdx_chem = jcoeff / idxb_max;
-        auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
+        real_type bvecj = my_sna.blist(ii,jdx_chem,jdxb);
        d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
        d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
        k++;
@ -1221,7 +1313,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
      for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
        const auto idxb = icoeff % idxb_max;
        const auto idx_chem = icoeff / idxb_max;
-        evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
+        evdwl += d_coeffi[icoeff+1]*my_sna.blist(ii,idx_chem,idxb);
      }

      // quadratic contributions
@ -1230,12 +1322,12 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
          const auto idxb = icoeff % idxb_max;
          const auto idx_chem = icoeff / idxb_max;
-          auto bveci = my_sna.blist(idxb,idx_chem,ii);
+          real_type bveci = my_sna.blist(ii,idx_chem,idxb);
          evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
            auto jdxb = jcoeff % idxb_max;
            auto jdx_chem = jcoeff / idxb_max;
-            auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
+            auto bvecj = my_sna.blist(ii,jdx_chem,jdxb);
            evdwl += d_coeffi[k++]*bveci*bvecj;
          }
        }
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@ -45,12 +45,12 @@ struct WignerWrapper {
  { ; }

  KOKKOS_INLINE_FUNCTION
-  complex get(const int& ma) {
+  complex get(const int& ma) const {
    return complex(buffer[offset + 2 * vector_length * ma], buffer[offset + vector_length + 2 * vector_length * ma]);
  }

  KOKKOS_INLINE_FUNCTION
-  void set(const int& ma, const complex& store) {
+  void set(const int& ma, const complex& store) const {
    buffer[offset + 2 * vector_length * ma] = store.re;
    buffer[offset + vector_length + 2 * vector_length * ma] = store.im;
  }
@ -122,8 +122,14 @@ inline
  void compute_cayley_klein(const int&, const int&, const int&);
  KOKKOS_INLINE_FUNCTION
  void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP
+
+  // version of the code with parallelism over j_bend
  KOKKOS_INLINE_FUNCTION
-  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
+  void compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
+  // version of the code without parallelism over j_bend
+  KOKKOS_INLINE_FUNCTION
+  void compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); // ForceSNAP
+
  KOKKOS_INLINE_FUNCTION
  void compute_zi(const int&, const int&, const int&);    // ForceSNAP
  KOKKOS_INLINE_FUNCTION
@ -135,6 +141,35 @@ inline
  KOKKOS_INLINE_FUNCTION
  void compute_bi(const int&, const int&, const int&);    // ForceSNAP

+  // functions for derivatives, GPU only
+  // version of the code with parallelism over j_bend
+  template<int dir>
+  KOKKOS_INLINE_FUNCTION
+  void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
+  // version of the code without parallelism over j_bend
+  template<int dir>
+  KOKKOS_INLINE_FUNCTION
+  void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); //ForceSNAP
+
+  // core "evaluation" functions that get plugged into "compute" functions
+  // plugged into compute_ui_small, compute_ui_large
+  KOKKOS_FORCEINLINE_FUNCTION
+  void evaluate_ui_jbend(const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&, const int&,
+                        const int&, const int&, const int&);
+  // plugged into compute_zi, compute_yi
+  KOKKOS_FORCEINLINE_FUNCTION
+  complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
+                        const int&, const int&, const int&, const int&, const real_type*);
+  // plugged into compute_yi, compute_yi_with_zlist
+  KOKKOS_FORCEINLINE_FUNCTION
+  real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
+                        const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &);
+  // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large
+  KOKKOS_FORCEINLINE_FUNCTION
+  real_type evaluate_duidrj_jbend(const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&, 
+                        const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&,
+                        const int&, const int&, const int&, const int&);
+
  // functions for bispectrum coefficients, CPU only
  KOKKOS_INLINE_FUNCTION
  void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
@ -148,11 +183,6 @@ inline
    KOKKOS_INLINE_FUNCTION
  void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);    // ForceSNAP

-  // functions for derivatives, GPU only
-  template<int dir>
-  KOKKOS_INLINE_FUNCTION
-  void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
-
  // functions for derivatives, CPU only
  KOKKOS_INLINE_FUNCTION
  void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
@ -168,23 +198,6 @@ inline
  KOKKOS_INLINE_FUNCTION
  void compute_s_dsfac(const real_type, const real_type, real_type&, real_type&); // compute_cayley_klein

-  static KOKKOS_FORCEINLINE_FUNCTION
-  void sincos_wrapper(double x, double* sin_, double *cos_) {
-#ifdef __SYCL_DEVICE_ONLY__
-    *sin_ = sycl::sincos(x, cos_);
-#else
-    sincos(x, sin_, cos_);
-#endif
-  }
-  static KOKKOS_FORCEINLINE_FUNCTION
-  void sincos_wrapper(float x, float* sin_, float *cos_) {
-#ifdef __SYCL_DEVICE_ONLY__
-    *sin_ = sycl::sincos(x, cos_);
-#else
-    sincosf(x, sin_, cos_);
-#endif
-  }
-
 #ifdef TIMING_INFO
  double* timers;
  timespec starttime, endtime;
@ -207,7 +220,7 @@ inline

  int twojmax, diagonalstyle;

-  t_sna_3d_ll blist;
+  t_sna_3d blist;
  t_sna_3c_ll ulisttot;
  t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only
  t_sna_3c_ll zlist;
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h