From cb548143eea8c1369975c0f98ac10cf82947e926 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 12:14:32 -0800 Subject: [PATCH] Unified zlist and blist CPU and GPU structures; greatly simplified/fused compute_bi --- src/KOKKOS/pair_snap_kokkos.h | 7 +- src/KOKKOS/pair_snap_kokkos_impl.h | 38 ++----- src/KOKKOS/sna_kokkos.h | 10 +- src/KOKKOS/sna_kokkos_impl.h | 167 +++-------------------------- 4 files changed, 23 insertions(+), 199 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index c9607daa1c..7d58f387d6 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -40,7 +40,6 @@ struct TagPairSNAPBeta{}; template struct TagPairSNAPComputeForce{}; - // GPU backend only struct TagPairSNAPComputeNeigh{}; struct TagPairSNAPComputeCayleyKlein{}; @@ -50,7 +49,6 @@ struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYiWithZlist{}; template @@ -187,9 +185,6 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; - KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const; @@ -221,7 +216,7 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeZiCPU,const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeBiCPU, const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYiCPU,const int& ii) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 2ca74375c4..b2252f85c7 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -264,9 +264,8 @@ void PairSNAPKokkos::compute(int eflag_in, Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); //ComputeBi - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_bi_cpu(chunk_size,team_size,vector_length); + int idxb_max = snaKK.idxb_max; + typename Kokkos::RangePolicy policy_bi_cpu(0, chunk_size * idxb_max); Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); } @@ -381,13 +380,6 @@ void PairSNAPKokkos::compute(int eflag_in, Snap3DRangePolicy policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); - - //Transform data layout of blist out of AoSoA - //We need this because `blist` gets used in ComputeForce which doesn't - //take advantage of AoSoA, which at best would only be beneficial on the margins - Snap3DRangePolicy - policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); - Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); } //Note zeroing `ylist` is fused into `TransformUi`. @@ -900,25 +892,6 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.compute_bi(iatom,jjb); } -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - if (idxb >= snaKK.idxb_max) return; - - const int ntriples = snaKK.ntriples; - - for (int itriple = 0; itriple < ntriples; itriple++) { - - const real_type blocal = snaKK.blist_gpu(iatom, idxb, itriple); - - snaKK.blist(iatom, itriple, idxb) = blocal; - } - -} - template template KOKKOS_INLINE_FUNCTION @@ -1151,9 +1124,10 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); - snaKK.compute_bi_cpu(team,ii); +void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU, const int& ii) const { + const int iatom = ii / snaKK.idxb_max; + const int jjb = ii % snaKK.idxb_max; + snaKK.compute_bi(iatom, jjb); } template diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 66d7e7254f..163c738651 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -244,8 +244,6 @@ class SNAKokkos { void compute_zi_cpu(const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi_cpu(int) const; // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int) const; // ForceSNAP // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION @@ -294,12 +292,13 @@ class SNAKokkos { t_sna_3d ulisttot_im; t_sna_3c ulisttot; // un-folded ulisttot + t_sna_3c zlist; + t_sna_3d blist; + t_sna_3d ylist_re; t_sna_3d ylist_im; // Structures for the CPU backend only - t_sna_3d blist; - t_sna_3c_ll zlist; t_sna_3c_ll ulist; @@ -313,9 +312,6 @@ class SNAKokkos { t_sna_3c db_gpu; // `db` t_sna_3d sfac_gpu; // sfac, dsfac_{x,y,z} - t_sna_3c zlist_gpu; - t_sna_3d blist_gpu; - int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max; // Chem snap counts diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 1623bec0e1..0e2753215b 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -317,6 +317,9 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(ulisttot_im,"sna:ulisttot_im", natom_pad, nelements, idxu_half_max); MemKK::realloc_kokkos(ulisttot,"sna:ulisttot", natom_pad, nelements, idxu_max); + MemKK::realloc_kokkos(zlist,"sna:zlist", natom_pad, ndoubles, idxz_max); + MemKK::realloc_kokkos(blist,"sna:blist", natom_pad, ntriples, idxb_max); + MemKK::realloc_kokkos(ylist_re,"sna:ylist_re", natom_pad, nelements, idxu_half_max); MemKK::realloc_kokkos(ylist_im,"sna:ylist_im", natom_pad, nelements, idxu_half_max); @@ -327,10 +330,6 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",natom_pad,nmax,3); MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",natom_pad,nmax,4); MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); - MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1); - MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",natom_pad,idxz_max,ndoubles); - MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); - MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",natom_pad,idxb_max,ntriples); MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); } else { MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",1,1); @@ -339,10 +338,6 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",1,1,1); MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",1,1,1); MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom_pad,nmax); - MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom_pad); - MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",1,1,1); - MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); - MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",1,1,1); MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom_pad,nmax); } } @@ -663,7 +658,7 @@ void SNAKokkos::compute_zi(const int& iato for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - zlist_gpu(iatom,jjz,idouble) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); + zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); idouble++; } @@ -711,7 +706,7 @@ void SNAKokkos::compute_bi(const int& iato const int jjz_index = jjz+mb*(j+1)+ma; if (2*mb == j) return; // I think we can remove this? const complex utot = ulisttot(iatom, elem3, jju_index); - const complex zloc = zlist_gpu(iatom, jjz_index, idouble); + const complex zloc = zlist(iatom, idouble, jjz_index); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } } @@ -727,7 +722,7 @@ void SNAKokkos::compute_bi(const int& iato const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; const complex utot = ulisttot(iatom, elem3, jju_index); - const complex zloc = zlist_gpu(iatom, jjz_index, idouble); + const complex zloc = zlist(iatom, idouble, jjz_index); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } @@ -738,7 +733,7 @@ void SNAKokkos::compute_bi(const int& iato const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; const complex utot = ulisttot(iatom, elem3, jju_index); - const complex zloc = zlist_gpu(iatom, jjz_index, idouble); + const complex zloc = zlist(iatom, idouble, jjz_index); sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); } // end if jeven @@ -752,7 +747,7 @@ void SNAKokkos::compute_bi(const int& iato sumzu -= bzero[j]; } } - blist_gpu(iatom, jjb, itriple) = sumzu; + blist(iatom, itriple, jjb) = sumzu; //} // end loop over j //} // end loop over j1, j2 itriple++; @@ -818,7 +813,7 @@ void SNAKokkos::compute_yi_with_zlist(cons int idouble = 0; for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - const complex ztmp = zlist_gpu(iatom,jjz,idouble); + const complex ztmp = zlist(iatom, idouble, jjz); // apply to z(j1,j2,j,ma,mb) to unique element of y(j) // find right y_list[jju] and beta(iatom,jjb) entries // multiply and divide by j+1 factors @@ -1240,144 +1235,12 @@ void SNAKokkos::compute_zi_cpu(const int& for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - zlist(jjz, idouble, iatom).re = static_cast(0.0); - zlist(jjz, idouble, iatom).im = static_cast(0.0); - - int jju1 = idxu_block[j1] + (j1+1)*mb1min; - int jju2 = idxu_block[j2] + (j2+1)*mb2max; - int icgb = mb1min*(j2+1) + mb2max; - for (int ib = 0; ib < nb; ib++) { - - real_type suma1_r = static_cast(0.0); - real_type suma1_i = static_cast(0.0); - - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min * (j2 + 1) + ma2max; - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re - ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im); - suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im + ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia - - zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r; - zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i; - - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib - - if (bnorm_flag) { - const real_type scale = static_cast(1) / static_cast(j + 1); - zlist(jjz, idouble, iatom).re *= scale; - zlist(jjz, idouble, iatom).im *= scale; - } + zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); idouble++; } // end loop over elem2 } // end loop over elem1 } - -/* ---------------------------------------------------------------------- - compute Bi by summing conj(Ui)*Zi, CPU version -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) const -{ - // for j1 = 0,...,twojmax - // for j2 = 0,twojmax - // for j = |j1-j2|,Min(twojmax,j1+j2),2 - // b(j1,j2,j) = 0 - // for mb = 0,...,jmid - // for ma = 0,...,j - // b(j1,j2,j) += - // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) - - int itriple = 0; - int idouble = 0; - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - int jalloy = idouble; // must be non-const to work around gcc compiler bug - for (int elem3 = 0; elem3 < nelements; elem3++) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max), - [&] (const int& jjb) { - const int j1 = idxb(jjb, 0); - const int j2 = idxb(jjb, 1); - int j = idxb(jjb, 2); // removed "const" to work around GCC 7 bug - - int jjz = idxz_block(j1, j2, j); - int jju = idxu_block[j]; - real_type sumzu = static_cast(0.0); - real_type sumzu_temp = static_cast(0.0); - const int bound = (j+2)/2; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound), - [&] (const int mbma, real_type& sum) { - const int ma = mbma % (j + 1); - const int mb = mbma / (j + 1); - const int jju_index = jju + mb * (j + 1) + ma; - const int jjz_index = jjz + mb * (j + 1) + ma; - if (2*mb == j) return; - sum += - ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im; - },sumzu_temp); // end loop over ma, mb - sumzu += sumzu_temp; - - // For j even, special treatment for middle column - - if (j%2 == 0) { - int mb = j/2; // removed "const" to work around GCC 7 bug - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb), - [&] (const int ma, real_type& sum) { - const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - sum += - ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im; - },sumzu_temp); // end loop over ma - sumzu += sumzu_temp; - - const int ma = mb; - const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - sumzu += static_cast(0.5)* - (ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im); - } // end if jeven - - Kokkos::single(Kokkos::PerThread(team), [&] () { - sumzu *= static_cast(2.0); - - // apply bzero shift - - if (bzero_flag) { - if (!wselfall_flag) { - if (elem1 == elem2 && elem1 == elem3) { - sumzu -= bzero[j]; - } - } else { - sumzu -= bzero[j]; - } - } - - blist(iatom, itriple, jjb) = sumzu; - }); - }); - //} // end loop over j - //} // end loop over j1, j2 - itriple++; - } - idouble++; - } // end loop over elem2 - } // end loop over elem1 - -} - /* ---------------------------------------------------------------------- compute Yi from Ui without storing Zi, looping over zlist indices, CPU version @@ -2295,6 +2158,9 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(ulisttot_im); bytes += MemKK::memory_usage(ulisttot); + bytes += MemKK::memory_usage(zlist); + bytes += MemKK::memory_usage(blist); + bytes += MemKK::memory_usage(ylist_re); bytes += MemKK::memory_usage(ylist_im); @@ -2304,15 +2170,8 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(da_gpu); bytes += MemKK::memory_usage(db_gpu); bytes += MemKK::memory_usage(sfac_gpu); - - bytes += MemKK::memory_usage(zlist_gpu); - bytes += MemKK::memory_usage(blist_gpu); } else { bytes += MemKK::memory_usage(ulist); - - bytes += MemKK::memory_usage(zlist); - bytes += MemKK::memory_usage(blist); - bytes += MemKK::memory_usage(dulist); }