Unified zlist and blist CPU and GPU structures; greatly simplified/fused compute_bi

2024-11-19 12:14:32 -08:00
parent cf6714ea33
commit cb548143ee
4 changed files with 23 additions and 199 deletions
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@ -40,7 +40,6 @@ struct TagPairSNAPBeta{};
 template<int NEIGHFLAG, int EVFLAG>
 struct TagPairSNAPComputeForce{};

-
 // GPU backend only
 struct TagPairSNAPComputeNeigh{};
 struct TagPairSNAPComputeCayleyKlein{};
@ -50,7 +49,6 @@ struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
 struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
 struct TagPairSNAPComputeZi{};
 struct TagPairSNAPComputeBi{};
-struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
 struct TagPairSNAPComputeYi{};
 struct TagPairSNAPComputeYiWithZlist{};
 template<int dir>
@ -187,9 +185,6 @@ class PairSNAPKokkos : public PairSNAP {
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;

-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
-
  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;

@ -221,7 +216,7 @@ class PairSNAPKokkos : public PairSNAP {
  void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;

  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;
+  void operator() (TagPairSNAPComputeBiCPU, const int& ii) const;

  KOKKOS_INLINE_FUNCTION
  void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -264,9 +264,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
        Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this);

        //ComputeBi
-        int team_size = team_size_default;
-        check_team_size_for<TagPairSNAPComputeBiCPU>(chunk_size,team_size);
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(chunk_size,team_size,vector_length);
+        int idxb_max = snaKK.idxb_max;
+        typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(0, chunk_size * idxb_max);
        Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
      }

@ -381,13 +380,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
        Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagPairSNAPComputeBi>
            policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
        Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
-
-        //Transform data layout of blist out of AoSoA
-        //We need this because `blist` gets used in ComputeForce which doesn't
-        //take advantage of AoSoA, which at best would only be beneficial on the margins
-        Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagPairSNAPTransformBi>
-            policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
-        Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
      }

      //Note zeroing `ylist` is fused into `TransformUi`.
@ -900,25 +892,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
  snaKK.compute_bi(iatom,jjb);
 }

-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
-
-  const int iatom = iatom_mod + iatom_div * vector_length;
-  if (iatom >= chunk_size) return;
-  if (idxb >= snaKK.idxb_max) return;
-
-  const int ntriples = snaKK.ntriples;
-
-  for (int itriple = 0; itriple < ntriples; itriple++) {
-
-    const real_type blocal = snaKK.blist_gpu(iatom, idxb, itriple);
-
-    snaKK.blist(iatom, itriple, idxb) = blocal;
-  }
-
-}
-
 template<class DeviceType, typename real_type, int vector_length>
 template<int dir>
 KOKKOS_INLINE_FUNCTION
@ -1151,9 +1124,10 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN

 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU>::member_type& team) const {
-  int ii = team.league_rank();
-  snaKK.compute_bi_cpu(team,ii);
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBiCPU, const int& ii) const {
+  const int iatom = ii / snaKK.idxb_max;
+  const int jjb = ii % snaKK.idxb_max;
+  snaKK.compute_bi(iatom, jjb);
 }

 template<class DeviceType, typename real_type, int vector_length>
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@ -244,8 +244,6 @@ class SNAKokkos {
  void compute_zi_cpu(const int&) const;    // ForceSNAP
  KOKKOS_INLINE_FUNCTION
  void compute_yi_cpu(int) const; // ForceSNAP
-    KOKKOS_INLINE_FUNCTION
-  void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int) const;    // ForceSNAP

  // functions for derivatives, CPU only
  KOKKOS_INLINE_FUNCTION
@ -294,12 +292,13 @@ class SNAKokkos {
  t_sna_3d ulisttot_im;
  t_sna_3c ulisttot; // un-folded ulisttot

+  t_sna_3c zlist;
+  t_sna_3d blist;
+
  t_sna_3d ylist_re;
  t_sna_3d ylist_im;

  // Structures for the CPU backend only
-  t_sna_3d blist;
-  t_sna_3c_ll zlist;

  t_sna_3c_ll ulist;

@ -313,9 +312,6 @@ class SNAKokkos {
  t_sna_3c db_gpu; // `db`
  t_sna_3d sfac_gpu; // sfac, dsfac_{x,y,z}

-  t_sna_3c zlist_gpu;
-  t_sna_3d blist_gpu;
-
  int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max;

  // Chem snap counts
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@ -317,6 +317,9 @@ void SNAKokkos<DeviceType, real_type, vector_length>::grow_rij(int newnatom, int
  MemKK::realloc_kokkos(ulisttot_im,"sna:ulisttot_im", natom_pad, nelements, idxu_half_max);
  MemKK::realloc_kokkos(ulisttot,"sna:ulisttot", natom_pad, nelements, idxu_max);

+  MemKK::realloc_kokkos(zlist,"sna:zlist", natom_pad, ndoubles, idxz_max);
+  MemKK::realloc_kokkos(blist,"sna:blist", natom_pad, ntriples, idxb_max);
+
  MemKK::realloc_kokkos(ylist_re,"sna:ylist_re", natom_pad, nelements, idxu_half_max);
  MemKK::realloc_kokkos(ylist_im,"sna:ylist_im", natom_pad, nelements, idxu_half_max);

@ -327,10 +330,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::grow_rij(int newnatom, int
    MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",natom_pad,nmax,3);
    MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",natom_pad,nmax,4);
    MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1);
-    MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1);
-    MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",natom_pad,idxz_max,ndoubles);
-    MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max);
-    MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",natom_pad,idxb_max,ntriples);
    MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1);
  } else {
    MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",1,1);
@ -339,10 +338,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::grow_rij(int newnatom, int
    MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",1,1,1);
    MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",1,1,1);
    MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom_pad,nmax);
-    MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom_pad);
-    MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",1,1,1);
-    MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max);
-    MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",1,1,1);
    MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom_pad,nmax);
  }
 }
@ -663,7 +658,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_zi(const int& iato
  for (int elem1 = 0; elem1 < nelements; elem1++) {
    for (int elem2 = 0; elem2 < nelements; elem2++) {

-      zlist_gpu(iatom,jjz,idouble) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock);
+      zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock);

      idouble++;
    }
@ -711,7 +706,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
            const int jjz_index = jjz+mb*(j+1)+ma;
            if (2*mb == j) return; // I think we can remove this?
            const complex utot = ulisttot(iatom, elem3, jju_index);
-            const complex zloc = zlist_gpu(iatom, jjz_index, idouble);
+            const complex zloc = zlist(iatom, idouble, jjz_index);
            sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;
          }
        }
@ -727,7 +722,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
            const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;

            const complex utot = ulisttot(iatom, elem3, jju_index);
-            const complex zloc = zlist_gpu(iatom, jjz_index, idouble);
+            const complex zloc = zlist(iatom, idouble, jjz_index);
            sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;

          }
@ -738,7 +733,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
          const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;

          const complex utot = ulisttot(iatom, elem3, jju_index);
-          const complex zloc = zlist_gpu(iatom, jjz_index, idouble);
+          const complex zloc = zlist(iatom, idouble, jjz_index);
          sumzu += static_cast<real_type>(0.5) * (utot.re * zloc.re + utot.im * zloc.im);
        } // end if jeven

@ -752,7 +747,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
            sumzu -= bzero[j];
          }
        }
-        blist_gpu(iatom, jjb, itriple) = sumzu;
+        blist(iatom, itriple, jjb) = sumzu;
            //} // end loop over j
          //} // end loop over j1, j2
        itriple++;
@ -818,7 +813,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_with_zlist(cons
  int idouble = 0;
  for (int elem1 = 0; elem1 < nelements; elem1++) {
    for (int elem2 = 0; elem2 < nelements; elem2++) {
-      const complex ztmp = zlist_gpu(iatom,jjz,idouble);
+      const complex ztmp = zlist(iatom, idouble, jjz);
      // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
      // find right y_list[jju] and beta(iatom,jjb) entries
      // multiply and divide by j+1 factors
@ -1240,144 +1235,12 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_zi_cpu(const int&

  for (int elem1 = 0; elem1 < nelements; elem1++) {
    for (int elem2 = 0; elem2 < nelements; elem2++) {
-      zlist(jjz, idouble, iatom).re = static_cast<real_type>(0.0);
-      zlist(jjz, idouble, iatom).im = static_cast<real_type>(0.0);
-
-      int jju1 = idxu_block[j1] + (j1+1)*mb1min;
-      int jju2 = idxu_block[j2] + (j2+1)*mb2max;
-      int icgb = mb1min*(j2+1) + mb2max;
-      for (int ib = 0; ib < nb; ib++) {
-
-        real_type suma1_r = static_cast<real_type>(0.0);
-        real_type suma1_i = static_cast<real_type>(0.0);
-
-        int ma1 = ma1min;
-        int ma2 = ma2max;
-        int icga = ma1min * (j2 + 1) + ma2max;
-        for (int ia = 0; ia < na; ia++) {
-          suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re - ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im);
-          suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im + ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).re);
-          ma1++;
-          ma2--;
-          icga += j2;
-        } // end loop over ia
-
-        zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r;
-        zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i;
-
-        jju1 += j1 + 1;
-        jju2 -= j2 + 1;
-        icgb += j2;
-      } // end loop over ib
-
-      if (bnorm_flag) {
-        const real_type scale = static_cast<real_type>(1) / static_cast<real_type>(j + 1);
-        zlist(jjz, idouble, iatom).re *= scale;
-        zlist(jjz, idouble, iatom).im *= scale;
-      }
+      zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock);
      idouble++;
    } // end loop over elem2
  } // end loop over elem1
 }

-
-/* ----------------------------------------------------------------------
-   compute Bi by summing conj(Ui)*Zi, CPU version
------------------------------------------------------------------------- */
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom) const
-{
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        b(j1,j2,j) = 0
-  //        for mb = 0,...,jmid
-  //          for ma = 0,...,j
-  //            b(j1,j2,j) +=
-  //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
-
-  int itriple = 0;
-  int idouble = 0;
-  for (int elem1 = 0; elem1 < nelements; elem1++) {
-    for (int elem2 = 0; elem2 < nelements; elem2++) {
-      int jalloy = idouble; // must be non-const to work around gcc compiler bug
-      for (int elem3 = 0; elem3 < nelements; elem3++) {
-        Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max),
-          [&] (const int& jjb) {
-          const int j1 = idxb(jjb, 0);
-          const int j2 = idxb(jjb, 1);
-          int j = idxb(jjb, 2); // removed "const" to work around GCC 7 bug
-
-          int jjz = idxz_block(j1, j2, j);
-          int jju = idxu_block[j];
-          real_type sumzu = static_cast<real_type>(0.0);
-          real_type sumzu_temp = static_cast<real_type>(0.0);
-          const int bound = (j+2)/2;
-          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound),
-              [&] (const int mbma, real_type& sum) {
-              const int ma = mbma % (j + 1);
-              const int mb = mbma / (j + 1);
-              const int jju_index = jju + mb * (j + 1) + ma;
-              const int jjz_index = jjz + mb * (j + 1) + ma;
-              if (2*mb == j) return;
-              sum +=
-                ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re +
-                ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im;
-            },sumzu_temp); // end loop over ma, mb
-            sumzu += sumzu_temp;
-
-          // For j even, special treatment for middle column
-
-          if (j%2 == 0) {
-            int mb = j/2; // removed "const" to work around GCC 7 bug
-            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb),
-                [&] (const int ma, real_type& sum) {
-              const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
-              const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
-              sum +=
-                ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re +
-                ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im;
-            },sumzu_temp); // end loop over ma
-            sumzu += sumzu_temp;
-
-            const int ma = mb;
-            const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
-            const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
-            sumzu += static_cast<real_type>(0.5)*
-              (ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re +
-               ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im);
-          } // end if jeven
-
-          Kokkos::single(Kokkos::PerThread(team), [&] () {
-            sumzu *= static_cast<real_type>(2.0);
-
-            // apply bzero shift
-
-            if (bzero_flag) {
-              if (!wselfall_flag) {
-                if (elem1 == elem2 && elem1 == elem3) {
-                  sumzu -= bzero[j];
-                }
-              } else {
-                sumzu -= bzero[j];
-              }
-            }
-
-            blist(iatom, itriple, jjb) = sumzu;
-          });
-        });
-          //} // end loop over j
-        //} // end loop over j1, j2
-        itriple++;
-      }
-      idouble++;
-    } // end loop over elem2
-  } // end loop over elem1
-
-}
-
 /* ----------------------------------------------------------------------
   compute Yi from Ui without storing Zi, looping over zlist indices,
   CPU version
@ -2295,6 +2158,9 @@ double SNAKokkos<DeviceType, real_type, vector_length>::memory_usage()
  bytes += MemKK::memory_usage(ulisttot_im);
  bytes += MemKK::memory_usage(ulisttot);

+  bytes += MemKK::memory_usage(zlist);
+  bytes += MemKK::memory_usage(blist);
+
  bytes += MemKK::memory_usage(ylist_re);
  bytes += MemKK::memory_usage(ylist_im);

@ -2304,15 +2170,8 @@ double SNAKokkos<DeviceType, real_type, vector_length>::memory_usage()
    bytes += MemKK::memory_usage(da_gpu);
    bytes += MemKK::memory_usage(db_gpu);
    bytes += MemKK::memory_usage(sfac_gpu);
-
-    bytes += MemKK::memory_usage(zlist_gpu);
-    bytes += MemKK::memory_usage(blist_gpu);
  } else {
    bytes += MemKK::memory_usage(ulist);
-
-    bytes += MemKK::memory_usage(zlist);
-    bytes += MemKK::memory_usage(blist);
-
    bytes += MemKK::memory_usage(dulist);
  }