From a4d821197e663d94a0b2b1bf4833b05ebd82e70f Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 08:56:36 -0800 Subject: [PATCH 01/30] Re-enable SNAP CPU codepaths to simplify verifying correctness --- src/KOKKOS/pair_snap_kokkos_impl.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 97c7d17ea9..86eeffff81 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -85,14 +85,14 @@ PairSNAPKokkos::~PairSNAPKokkos() template void PairSNAPKokkos::init_style() { - if (host_flag) { - if (lmp->kokkos->nthreads > 1) - error->all(FLERR,"Pair style snap/kk can currently only run on a single " - "CPU thread"); + //if (host_flag) { + // if (lmp->kokkos->nthreads > 1) + // error->all(FLERR,"Pair style snap/kk can currently only run on a single " + // "CPU thread"); - PairSNAP::init_style(); - return; - } + // PairSNAP::init_style(); + // return; + //} if (force->newton_pair == 0) error->all(FLERR,"Pair style SNAP requires newton pair on"); @@ -134,12 +134,12 @@ struct FindMaxNumNeighs { template void PairSNAPKokkos::compute(int eflag_in, int vflag_in) { - if (host_flag) { - atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); - PairSNAP::compute(eflag_in,vflag_in); - atomKK->modified(Host,F_MASK); - return; - } + //if (host_flag) { + // atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); + // PairSNAP::compute(eflag_in,vflag_in); + // atomKK->modified(Host,F_MASK); + // return; + //} eflag = eflag_in; vflag = vflag_in; From ce92b565b27568aacf0a353d1bec2f73feca4cd9 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 10:00:56 -0800 Subject: [PATCH 02/30] Made SNAP host_flag constexpr, took advantage of if constexpr to reduce compile time --- src/KOKKOS/pair_snap_kokkos.h | 4 +++- src/KOKKOS/pair_snap_kokkos_impl.h | 20 +++++++------------- src/KOKKOS/sna_kokkos.h | 3 ++- src/KOKKOS/sna_kokkos_impl.h | 16 
++-------------- 4 files changed, 14 insertions(+), 29 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index ca1884bfd2..bb599a46ea 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -80,6 +80,8 @@ class PairSNAPKokkos : public PairSNAP { typedef ArrayTypes AT; typedef EV_FLOAT value_type; + static constexpr LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; + static constexpr int host_flag = (execution_space == LAMMPS_NS::Host); static constexpr int vector_length = vector_length_; using real_type = real_type_; using complex = SNAComplex; @@ -252,7 +254,7 @@ class PairSNAPKokkos : public PairSNAP { SNAKokkos snaKK; int inum,max_neighs,chunk_size,chunk_offset; - int host_flag,neighflag; + int neighflag; int eflag,vflag; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 86eeffff81..f6cb290dc6 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -59,11 +59,8 @@ PairSNAPKokkos::PairSNAPKokkos(LAMMPS *lmp kokkosable = 1; atomKK = (AtomKokkos *) atom; - execution_space = ExecutionSpaceFromDevice::space; datamask_read = EMPTY_MASK; datamask_modify = EMPTY_MASK; - - host_flag = (execution_space == Host); } /* ---------------------------------------------------------------------- */ @@ -85,7 +82,7 @@ PairSNAPKokkos::~PairSNAPKokkos() template void PairSNAPKokkos::init_style() { - //if (host_flag) { + //if constexpr (host_flag) { // if (lmp->kokkos->nthreads > 1) // error->all(FLERR,"Pair style snap/kk can currently only run on a single " // "CPU thread"); @@ -134,7 +131,7 @@ struct FindMaxNumNeighs { template void PairSNAPKokkos::compute(int eflag_in, int vflag_in) { - //if (host_flag) { + //if constexpr (host_flag) { // atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); // PairSNAP::compute(eflag_in,vflag_in); // atomKK->modified(Host,F_MASK); @@ -197,13 +194,13 @@ void PairSNAPKokkos::compute(int 
eflag_in, Kokkos::parallel_reduce("PairSNAPKokkos::find_max_neighs",inum, FindMaxNumNeighs(k_list), Kokkos::Max(max_neighs)); int team_size_default = 1; - if (!host_flag) + if constexpr (!host_flag) team_size_default = 32;//max_neighs; if (beta_max < inum) { beta_max = inum; MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta",ncoeff,inum); - if (!host_flag) + if constexpr (!host_flag) MemKK::realloc_kokkos(d_beta_pack,"PairSNAPKokkos:beta_pack",vector_length,ncoeff,(inum + vector_length - 1) / vector_length); MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",inum); } @@ -222,7 +219,7 @@ void PairSNAPKokkos::compute(int eflag_in, if (chunk_size > inum - chunk_offset) chunk_size = inum - chunk_offset; - if (host_flag) + if constexpr (host_flag) { // Host codepath @@ -297,7 +294,7 @@ void PairSNAPKokkos::compute(int eflag_in, } else { // GPU -#ifdef LMP_KOKKOS_GPU + // The compiler will only hit this codepath if this is the GPU code // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; @@ -462,9 +459,6 @@ void PairSNAPKokkos::compute(int eflag_in, } } - -#endif // LMP_KOKKOS_GPU - } //ComputeForce @@ -1411,7 +1405,7 @@ double PairSNAPKokkos::memory_usage() { double bytes = Pair::memory_usage(); bytes += MemKK::memory_usage(d_beta); - if (!host_flag) + if constexpr (!host_flag) bytes += MemKK::memory_usage(d_beta_pack); bytes += MemKK::memory_usage(d_ninside); bytes += MemKK::memory_usage(d_map); diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 78a3dfa669..8c3ddfd9f7 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -134,6 +134,8 @@ class SNAKokkos { static constexpr int vector_length = vector_length_; using KKDeviceType = typename KKDevice::value; + static constexpr LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; + static constexpr int host_flag = (execution_space == LAMMPS_NS::Host); typedef 
Kokkos::View t_sna_1i; typedef Kokkos::View t_sna_1d; @@ -182,7 +184,6 @@ class SNAKokkos { double memory_usage(); int ncoeff; - int host_flag; // functions for bispectrum coefficients, GPU only KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index d5690ea60a..8cee2d32e4 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -34,9 +34,6 @@ SNAKokkos::SNAKokkos(real_type rfac0_in, int twojmax_in, real_type rmin0_in, int switch_flag_in, int bzero_flag_in, int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, int nelements_in, int switch_inner_flag_in) { - LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; - host_flag = (execution_space == LAMMPS_NS::Host); - wself = static_cast(1.0); rfac0 = rfac0_in; @@ -312,8 +309,7 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(element,"sna:element",natom,nmax); MemKK::realloc_kokkos(dedr,"sna:dedr",natom,nmax,3); -#ifdef LMP_KOKKOS_GPU - if (!host_flag) { + if constexpr (!host_flag) { const int natom_div = (natom + vector_length - 1) / vector_length; MemKK::realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div); @@ -336,7 +332,6 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div); MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); } else { -#endif MemKK::realloc_kokkos(a_pack,"sna:a_pack",1,1,1); MemKK::realloc_kokkos(b_pack,"sna:b_pack",1,1,1); MemKK::realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1); @@ -356,10 +351,7 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1); MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1); MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom,nmax); - -#ifdef LMP_KOKKOS_GPU } -#endif } /* ---------------------------------------------------------------------- @@ -2339,8 +2331,7 @@ double 
SNAKokkos::memory_usage() bytes += MemKK::memory_usage(rootpqarray); bytes += MemKK::memory_usage(cglist); -#ifdef LMP_KOKKOS_GPU - if (!host_flag) { + if constexpr (!host_flag) { bytes += MemKK::memory_usage(a_pack); bytes += MemKK::memory_usage(b_pack); @@ -2359,7 +2350,6 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(ylist_pack_re); bytes += MemKK::memory_usage(ylist_pack_im); } else { -#endif bytes += MemKK::memory_usage(ulist); bytes += MemKK::memory_usage(ulisttot); @@ -2371,9 +2361,7 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(ylist); bytes += MemKK::memory_usage(dulist); -#ifdef LMP_KOKKOS_GPU } -#endif bytes += MemKK::memory_usage(dedr); From 58d70366c275a6bce73f9530f238ffd59a4b8a91 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 10:12:47 -0800 Subject: [PATCH 03/30] Added padding to non-AoSoA data structures to improve alignment --- src/KOKKOS/sna_kokkos.h | 2 +- src/KOKKOS/sna_kokkos_impl.h | 38 ++++++++++++++++++++---------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 8c3ddfd9f7..2bf34674c4 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -284,7 +284,7 @@ class SNAKokkos { t_sna_2d dinnerij; t_sna_2i element; t_sna_3d dedr; - int natom, nmax; + int natom, natom_pad, nmax; void grow_rij(int, int); diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 8cee2d32e4..39cec2dcea 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -56,6 +56,7 @@ SNAKokkos::SNAKokkos(real_type rfac0_in, nmax = 0; natom = 0; + natom_pad = 0; build_indexlist(); @@ -298,19 +299,21 @@ void SNAKokkos::grow_rij(int newnatom, int { if (newnatom <= natom && newnmax <= nmax) return; natom = newnatom; + // Create padded structures + const int natom_div = (natom + vector_length - 1) / vector_length; + natom_pad = natom_div * vector_length; nmax = newnmax; - 
MemKK::realloc_kokkos(rij,"sna:rij",natom,nmax,3); - MemKK::realloc_kokkos(wj,"sna:wj",natom,nmax); - MemKK::realloc_kokkos(rcutij,"sna:rcutij",natom,nmax); - MemKK::realloc_kokkos(sinnerij,"sna:sinnerij",natom,nmax); - MemKK::realloc_kokkos(dinnerij,"sna:dinnerij",natom,nmax); - MemKK::realloc_kokkos(inside,"sna:inside",natom,nmax); - MemKK::realloc_kokkos(element,"sna:element",natom,nmax); - MemKK::realloc_kokkos(dedr,"sna:dedr",natom,nmax,3); + MemKK::realloc_kokkos(rij,"sna:rij",natom_pad,nmax,3); + MemKK::realloc_kokkos(wj,"sna:wj",natom_pad,nmax); + MemKK::realloc_kokkos(rcutij,"sna:rcutij",natom_pad,nmax); + MemKK::realloc_kokkos(sinnerij,"sna:sinnerij",natom_pad,nmax); + MemKK::realloc_kokkos(dinnerij,"sna:dinnerij",natom_pad,nmax); + MemKK::realloc_kokkos(inside,"sna:inside",natom_pad,nmax); + MemKK::realloc_kokkos(element,"sna:element",natom_pad,nmax); + MemKK::realloc_kokkos(dedr,"sna:dedr",natom_pad,nmax,3); if constexpr (!host_flag) { - const int natom_div = (natom + vector_length - 1) / vector_length; MemKK::realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div); MemKK::realloc_kokkos(b_pack,"sna:b_pack",vector_length,nmax,natom_div); @@ -325,7 +328,7 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1); MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div); - MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); + MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",vector_length,idxb_max,ntriples,natom_div); MemKK::realloc_kokkos(ylist,"sna:ylist",1,1,1); MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",vector_length,idxu_half_max,nelements,natom_div); @@ -337,20 +340,21 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1); MemKK::realloc_kokkos(db_pack,"sna:db_pack",1,1,1,1); 
MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",1,1,1,1); - MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom); - MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom); + MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom_pad); + MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom_pad); MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1); MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1); MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1); - MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom,nmax); - MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom); + MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom_pad,nmax); + MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom_pad); MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1); - MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); + MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",1,1,1,1); - MemKK::realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom); + MemKK::realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom_pad); MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1); MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1); - MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom,nmax); + MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom_pad,nmax); + } } From c9754e5fd3f444613bcb2424b457af08cf96cc82 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 10:24:27 -0800 Subject: [PATCH 04/30] Added const to each of the snaKK routines; removed extra snaKK copies --- src/KOKKOS/pair_snap_kokkos_impl.h | 174 ++++++++++++----------------- src/KOKKOS/sna_kokkos.h | 54 ++++----- src/KOKKOS/sna_kokkos_impl.h | 54 ++++----- 3 files changed, 127 insertions(+), 155 
deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index f6cb290dc6..492c401074 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -624,7 +624,6 @@ void PairSNAPKokkos::operator() (TagPairSN const int i = d_ilist[ii + chunk_offset]; const int itype = type[i]; const int ielem = d_map[itype]; - SNAKokkos my_sna = snaKK; auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); @@ -633,18 +632,18 @@ void PairSNAPKokkos::operator() (TagPairSN } if (quadraticflag) { - const auto idxb_max = my_sna.idxb_max; + const auto idxb_max = snaKK.idxb_max; int k = ncoeff+1; for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - real_type bveci = my_sna.blist(ii, idx_chem, idxb); + real_type bveci = snaKK.blist(ii, idx_chem, idxb); d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci; k++; for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { const auto jdxb = jcoeff % idxb_max; const auto jdx_chem = jcoeff / idxb_max; - real_type bvecj = my_sna.blist(ii, jdx_chem, jdxb); + real_type bvecj = snaKK.blist(ii, jdx_chem, jdxb); d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj; d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci; k++; @@ -657,8 +656,6 @@ template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - // extract atom number int ii = team.team_rank() + team.league_rank() * team.team_size(); if (ii >= chunk_size) return; @@ -725,20 +722,20 @@ void PairSNAPKokkos::operator() (TagPairSN const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; const int jelem = d_map[jtype]; - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); - my_sna.wj(ii,offset) = 
static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; } offset++; } @@ -748,7 +745,6 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int ii = iatom_mod + iatom_div * vector_length; if (ii >= chunk_size) return; @@ -756,13 +752,12 @@ void PairSNAPKokkos::operator() (TagPairSN const int ninside = d_ninside(ii); if (jnbor >= ninside) return; - my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); + snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int ii = iatom_mod + iatom_div * vector_length; if (ii >= chunk_size) return; @@ -770,13 +765,12 @@ void PairSNAPKokkos::operator() (TagPairSN int itype = type(ii); int ielem = d_map[itype]; - my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); + snaKK.pre_ui(iatom_mod, j, ielem, iatom_div); } 
template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // extract flattened atom_div / neighbor number / bend location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; @@ -795,7 +789,7 @@ void PairSNAPKokkos::operator() (TagPairSN const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); }); } @@ -803,7 +797,6 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // extract flattened atom_div / neighbor number / bend location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; @@ -820,7 +813,7 @@ void PairSNAPKokkos::operator() (TagPairSN const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div); + snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); }); } @@ -829,21 +822,20 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (idxu > my_sna.idxu_max) return; + if (idxu > snaKK.idxu_max) return; int elem_count = chemflag ? 
nelements : 1; for (int ielem = 0; ielem < elem_count; ielem++) { - const FullHalfMapper mapper = my_sna.idxu_full_half[idxu]; + const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); if (mapper.flip_sign == 1) { utot_im = -utot_im; @@ -851,11 +843,11 @@ void PairSNAPKokkos::operator() (TagPairSN utot_re = -utot_re; } - my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; if (mapper.flip_sign == 0) { - my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; } } } @@ -863,72 +855,67 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjz >= my_sna.idxz_max) return; + if (jjz >= snaKK.idxz_max) return; - my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack); + snaKK.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjz >= 
my_sna.idxz_max) return; + if (jjz >= snaKK.idxz_max) return; - my_sna.compute_yi_with_zlist(iatom_mod,jjz,iatom_div,d_beta_pack); + snaKK.compute_yi_with_zlist(iatom_mod,jjz,iatom_div,d_beta_pack); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjz >= my_sna.idxz_max) return; + if (jjz >= snaKK.idxz_max) return; - my_sna.compute_zi(iatom_mod,jjz,iatom_div); + snaKK.compute_zi(iatom_mod,jjz,iatom_div); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjb >= my_sna.idxb_max) return; + if (jjb >= snaKK.idxb_max) return; - my_sna.compute_bi(iatom_mod,jjb,iatom_div); + snaKK.compute_bi(iatom_mod,jjb,iatom_div); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (idxb >= my_sna.idxb_max) return; + if (idxb >= snaKK.idxb_max) return; - const int ntriples = my_sna.ntriples; + const int ntriples = snaKK.ntriples; for (int itriple = 0; itriple < ntriples; itriple++) { - const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); + const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div); - my_sna.blist(iatom, itriple, idxb) = blocal; + snaKK.blist(iatom, itriple, idxb) = blocal; } } @@ -937,7 +924,6 @@ template template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicy >::member_type& 
team) const { - SNAKokkos my_sna = snaKK; // extract flattened atom_div / neighbor number / bend location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; @@ -956,7 +942,7 @@ void PairSNAPKokkos::operator() (TagPairSN const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.template compute_fused_deidrj_small(team, iatom_mod, jbend, jj, iatom_div); + snaKK.template compute_fused_deidrj_small(team, iatom_mod, jbend, jj, iatom_div); }); @@ -966,7 +952,6 @@ template template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicy >::member_type& team) const { - SNAKokkos my_sna = snaKK; // extract flattened atom_div / neighbor number / bend location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; @@ -983,7 +968,7 @@ void PairSNAPKokkos::operator() (TagPairSN const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.template compute_fused_deidrj_large(team, iatom_mod, jj, iatom_div); + snaKK.template compute_fused_deidrj_large(team, iatom_mod, jj, iatom_div); }); } @@ -1003,7 +988,6 @@ void PairSNAPKokkos::operator() (TagPairSN const int i = d_ilist[ii + chunk_offset]; const int itype = type[i]; const int ielem = d_map[itype]; - SNAKokkos my_sna = snaKK; auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); @@ -1011,18 +995,18 @@ void PairSNAPKokkos::operator() (TagPairSN d_beta(icoeff,ii) = d_coeffi[icoeff+1]; if (quadraticflag) { - const auto idxb_max = my_sna.idxb_max; + const auto idxb_max = snaKK.idxb_max; int k = ncoeff+1; for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - real_type bveci = my_sna.blist(ii,idx_chem,idxb); + real_type bveci = snaKK.blist(ii,idx_chem,idxb); d_beta(icoeff,ii) += d_coeffi[k]*bveci; k++; for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { const auto jdxb = 
jcoeff % idxb_max; const auto jdx_chem = jcoeff / idxb_max; - real_type bvecj = my_sna.blist(ii,jdx_chem,jdxb); + real_type bvecj = snaKK.blist(ii,jdx_chem,jdxb); d_beta(icoeff,ii) += d_coeffi[k]*bvecj; d_beta(jcoeff,ii) += d_coeffi[k]*bveci; k++; @@ -1035,10 +1019,8 @@ template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); const int i = d_ilist[ii + chunk_offset]; - SNAKokkos my_sna = snaKK; const double xtmp = x(i,0); const double ytmp = x(i,1); const double ztmp = x(i,2); @@ -1088,20 +1070,20 @@ void PairSNAPKokkos::operator() (TagPairSN if (rsq < rnd_cutsq(itype,jtype)) { if (final) { - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); - my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; } offset++; } @@ -1112,7 +1094,6 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename 
Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // Extract the atom number const int ii = team.team_rank() + team.team_size() * team.league_rank(); @@ -1120,7 +1101,7 @@ void PairSNAPKokkos::operator() (TagPairSN int itype = type(ii); int ielem = d_map[itype]; - my_sna.pre_ui_cpu(team,ii,ielem); + snaKK.pre_ui_cpu(team,ii,ielem); } @@ -1128,7 +1109,6 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // Extract the atom number int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); @@ -1139,13 +1119,12 @@ void PairSNAPKokkos::operator() (TagPairSN const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_ui_cpu(team,ii,jj); + snaKK.compute_ui_cpu(team,ii,jj); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const { - SNAKokkos my_sna = snaKK; if (iatom >= chunk_size) return; @@ -1156,8 +1135,8 @@ void PairSNAPKokkos::operator() (TagPairSN // De-symmetrize ulisttot for (int ielem = 0; ielem < elem_count; ielem++) { - const int jju_half = my_sna.idxu_half_block(j); - const int jju = my_sna.idxu_block(j); + const int jju_half = snaKK.idxu_half_block(j); + const int jju = snaKK.idxu_block(j); for (int mb = 0; 2*mb <= j; mb++) { for (int ma = 0; ma <= j; ma++) { @@ -1168,13 +1147,13 @@ void PairSNAPKokkos::operator() (TagPairSN const int idxu = jju + idxu_shift; // Load ulist - auto utot = my_sna.ulisttot(idxu_half, ielem, iatom); + auto utot = snaKK.ulisttot(idxu_half, ielem, iatom); // Store - my_sna.ulisttot_full(idxu, ielem, iatom) = utot; + snaKK.ulisttot_full(idxu, ielem, iatom) = utot; // Zero Yi - my_sna.ylist(idxu_half, ielem, iatom) = {0., 0.}; + snaKK.ylist(idxu_half, ielem, iatom) = {0., 0.}; // 
Symmetric term const int sign_factor = (((ma+mb)%2==0)?1:-1); @@ -1186,7 +1165,7 @@ void PairSNAPKokkos::operator() (TagPairSN utot.re = -utot.re; } - my_sna.ulisttot_full(idxu_flip, ielem, iatom) = utot; + snaKK.ulisttot_full(idxu_flip, ielem, iatom) = utot; } } } @@ -1195,29 +1174,25 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { - SNAKokkos my_sna = snaKK; - my_sna.compute_yi_cpu(ii,d_beta); + snaKK.compute_yi_cpu(ii,d_beta); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU,const int& ii) const { - SNAKokkos my_sna = snaKK; - my_sna.compute_zi_cpu(ii); + snaKK.compute_zi_cpu(ii); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { int ii = team.league_rank(); - SNAKokkos my_sna = snaKK; - my_sna.compute_bi_cpu(team,ii); + snaKK.compute_bi_cpu(team,ii); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // Extract the atom number int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); @@ -1228,13 +1203,12 @@ void PairSNAPKokkos::operator() (TagPairSN const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_duidrj_cpu(team,ii,jj); + snaKK.compute_duidrj_cpu(team,ii,jj); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // Extract the atom number int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); @@ -1245,7 +1219,7 @@ void PairSNAPKokkos::operator() (TagPairSN const int ninside = d_ninside(ii); 
if (jj >= ninside) return; - my_sna.compute_deidrj_cpu(team,ii,jj); + snaKK.compute_deidrj_cpu(team,ii,jj); } /* ---------------------------------------------------------------------- @@ -1265,17 +1239,15 @@ void PairSNAPKokkos::operator() (TagPairSN const int i = d_ilist[ii + chunk_offset]; - SNAKokkos my_sna = snaKK; - const int ninside = d_ninside(ii); for (int jj = 0; jj < ninside; jj++) { - int j = my_sna.inside(ii,jj); + int j = snaKK.inside(ii,jj); F_FLOAT fij[3]; - fij[0] = my_sna.dedr(ii,jj,0); - fij[1] = my_sna.dedr(ii,jj,1); - fij[2] = my_sna.dedr(ii,jj,2); + fij[0] = snaKK.dedr(ii,jj,0); + fij[1] = snaKK.dedr(ii,jj,1); + fij[2] = snaKK.dedr(ii,jj,2); a_f(i,0) += fij[0]; a_f(i,1) += fij[1]; @@ -1288,8 +1260,8 @@ void PairSNAPKokkos::operator() (TagPairSN if (vflag_either) { v_tally_xyz(ev,i,j, fij[0],fij[1],fij[2], - -my_sna.rij(ii,jj,0),-my_sna.rij(ii,jj,1), - -my_sna.rij(ii,jj,2)); + -snaKK.rij(ii,jj,0),-snaKK.rij(ii,jj,1), + -snaKK.rij(ii,jj,2)); } } @@ -1316,7 +1288,7 @@ void PairSNAPKokkos::operator() (TagPairSN for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - evdwl += d_coeffi[icoeff+1]*my_sna.blist(ii,idx_chem,idxb); + evdwl += d_coeffi[icoeff+1]*snaKK.blist(ii,idx_chem,idxb); } // quadratic contributions @@ -1325,12 +1297,12 @@ void PairSNAPKokkos::operator() (TagPairSN for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - real_type bveci = my_sna.blist(ii,idx_chem,idxb); + real_type bveci = snaKK.blist(ii,idx_chem,idxb); evdwl += 0.5*d_coeffi[k++]*bveci*bveci; for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { auto jdxb = jcoeff % idxb_max; auto jdx_chem = jcoeff / idxb_max; - auto bvecj = my_sna.blist(ii,jdx_chem,jdxb); + auto bvecj = snaKK.blist(ii,jdx_chem,jdxb); evdwl += d_coeffi[k++]*bveci*bvecj; } } diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 
2bf34674c4..9456a219d4 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -187,84 +187,84 @@ class SNAKokkos { // functions for bispectrum coefficients, GPU only KOKKOS_INLINE_FUNCTION - void compute_cayley_klein(const int&, const int&, const int&); + void compute_cayley_klein(const int&, const int&, const int&) const; KOKKOS_INLINE_FUNCTION - void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP + void pre_ui(const int&, const int&, const int&, const int&) const; // ForceSNAP // version of the code with parallelism over j_bend KOKKOS_INLINE_FUNCTION - void compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); // ForceSNAP + void compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int) const; // ForceSNAP // version of the code without parallelism over j_bend KOKKOS_INLINE_FUNCTION - void compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int); // ForceSNAP + void compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_zi(const int&, const int&, const int&); // ForceSNAP + void compute_zi(const int&, const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi(int,int,int, - const Kokkos::View &beta_pack); // ForceSNAP + const Kokkos::View &beta_pack) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi_with_zlist(int,int,int, - const Kokkos::View &beta_pack); // ForceSNAP + const Kokkos::View &beta_pack) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_bi(const int&, const int&, const int&); // ForceSNAP + void compute_bi(const int&, const int&, const int&) const; // ForceSNAP // functions for derivatives, GPU only // version of the code with parallelism over j_bend template KOKKOS_INLINE_FUNCTION - void 
compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); //ForceSNAP + void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int) const; //ForceSNAP // version of the code without parallelism over j_bend template KOKKOS_INLINE_FUNCTION - void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int); //ForceSNAP + void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int) const; //ForceSNAP // core "evaluation" functions that get plugged into "compute" functions // plugged into compute_ui_small, compute_ui_large KOKKOS_FORCEINLINE_FUNCTION void evaluate_ui_jbend(const WignerWrapper&, const complex&, const complex&, const real_type&, const int&, - const int&, const int&, const int&); + const int&, const int&, const int&) const; // plugged into compute_zi, compute_yi KOKKOS_FORCEINLINE_FUNCTION complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, - const int&, const int&, const int&, const int&, const real_type*); + const int&, const int&, const int&, const int&, const real_type*) const; // plugged into compute_yi, compute_yi_with_zlist KOKKOS_FORCEINLINE_FUNCTION real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, - const Kokkos::View &); + const Kokkos::View &) const; // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large KOKKOS_FORCEINLINE_FUNCTION real_type evaluate_duidrj_jbend(const WignerWrapper&, const complex&, const complex&, const real_type&, const WignerWrapper&, const complex&, const complex&, const real_type&, - const int&, const int&, const int&, const int&); + const int&, const int&, const int&, const int&) const; // functions for 
bispectrum coefficients, CPU only KOKKOS_INLINE_FUNCTION - void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP + void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_zi_cpu(const int&); // ForceSNAP + void compute_zi_cpu(const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi_cpu(int, - const Kokkos::View &beta); // ForceSNAP + const Kokkos::View &beta) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int) const; // ForceSNAP // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION - void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP + void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int) const; //ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - real_type compute_sfac(real_type, real_type, real_type, real_type); // add_uarraytot, compute_duarray + real_type compute_sfac(real_type, real_type, real_type, real_type) const; // add_uarraytot, compute_duarray KOKKOS_INLINE_FUNCTION - real_type compute_dsfac(real_type, real_type, real_type, real_type); // compute_duarray + real_type compute_dsfac(real_type, real_type, real_type, real_type) const; // compute_duarray KOKKOS_INLINE_FUNCTION - void compute_s_dsfac(const 
real_type, const real_type, const real_type, const real_type, real_type&, real_type&); // compute_cayley_klein + void compute_s_dsfac(const real_type, const real_type, const real_type, const real_type, real_type&, real_type&) const; // compute_cayley_klein #ifdef TIMING_INFO double* timers; @@ -365,12 +365,12 @@ class SNAKokkos { void init_rootpqarray(); // init() KOKKOS_INLINE_FUNCTION - void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int); // compute_ui + void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui KOKKOS_INLINE_FUNCTION void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, const real_type&, const real_type&, const real_type&, - const real_type&, const real_type&); // compute_ui_cpu + const real_type&, const real_type&) const; // compute_ui_cpu inline @@ -382,7 +382,7 @@ class SNAKokkos { void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, - const real_type&, const real_type&); + const real_type&, const real_type&) const; // Sets the style for the switching function // 0 = none diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 39cec2dcea..7ebdcc5a0a 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -371,7 +371,7 @@ void SNAKokkos::grow_rij(int newnatom, int template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_cayley_klein(const int& iatom_mod, const int& jnbor, const int& iatom_div) +void SNAKokkos::compute_cayley_klein(const int& iatom_mod, const int& jnbor, const int& iatom_div) const { const 
int iatom = iatom_mod + vector_length * iatom_div; const real_type x = rij(iatom,jnbor,0); @@ -460,7 +460,7 @@ void SNAKokkos::compute_cayley_klein(const template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int& iatom_div) +void SNAKokkos::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int& iatom_div) const { for (int jelem = 0; jelem < nelements; jelem++) { @@ -494,7 +494,7 @@ void SNAKokkos::pre_ui(const int& iatom_mo template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) +void SNAKokkos::compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) const { // get shared memory offset @@ -525,7 +525,7 @@ void SNAKokkos::compute_ui_small(const typ // and some amount of load imbalance, at the expense of reducing parallelism template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) +void SNAKokkos::compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) const { // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer @@ -558,7 +558,7 @@ template KOKKOS_FORCEINLINE_FUNCTION void SNAKokkos::evaluate_ui_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const real_type& sfac, const int& jelem, - const int& iatom_mod, const int& j_bend, const int& iatom_div) + const int& iatom_mod, const int& j_bend, const int& iatom_div) const { // utot(j,ma,mb) = 0 for all j,ma,ma @@ -664,7 +664,7 @@ void SNAKokkos::evaluate_ui_jbend(const Wi template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi(const int& 
iatom_mod, const int& jjz, const int& iatom_div) +void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div) const { int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg; @@ -692,7 +692,7 @@ void SNAKokkos::compute_zi(const int& iato template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) +void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) const { // for j1 = 0,...,twojmax // for j2 = 0,twojmax @@ -786,7 +786,7 @@ void SNAKokkos::compute_bi(const int& iato template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_pack) + const Kokkos::View &beta_pack) const { int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; @@ -827,7 +827,7 @@ void SNAKokkos::compute_yi(int iatom_mod, template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_pack) + const Kokkos::View &beta_pack) const { int j1, j2, j, jju_half; idxz(jjz).get_yi_with_zlist(j1, j2, j, jju_half); @@ -859,7 +859,7 @@ template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::complex SNAKokkos::evaluate_zi(const int& j1, const int& j2, const int& j, const int& ma1min, const int& ma2max, const int& mb1min, const int& mb2max, const int& na, const int& nb, - const int& iatom_mod, const int& elem1, const int& elem2, const int& iatom_div, const real_type* cgblock) { + const int& iatom_mod, const int& elem1, const int& elem2, const int& iatom_div, const real_type* cgblock) const { complex ztmp = complex::zero(); @@ -911,7 +911,7 @@ template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::real_type SNAKokkos::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div, - const Kokkos::View &beta_pack) { + const Kokkos::View 
&beta_pack) const { real_type betaj = 0; @@ -951,7 +951,7 @@ typename SNAKokkos::real_type SNAKokkos template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) +void SNAKokkos::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) const { // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer @@ -988,7 +988,7 @@ void SNAKokkos::compute_fused_deidrj_small template template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) +void SNAKokkos::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) const { // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer @@ -1032,7 +1032,7 @@ template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::real_type SNAKokkos::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const real_type& sfac, const WignerWrapper& dulist_wrapper, const complex& da, const complex& db, const real_type& dsfacu, - const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) { + const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) const { real_type dedr_full_sum = static_cast(0); @@ -1178,7 +1178,7 @@ typename SNAKokkos::real_type SNAKokkos KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) +void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) const { for (int jelem = 
0; jelem < nelements; jelem++) { for (int j = 0; j <= twojmax; j++) { @@ -1212,7 +1212,7 @@ void SNAKokkos::pre_ui_cpu(const typename template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) const { real_type rsq, r, x, y, z, z0, theta0; @@ -1242,7 +1242,7 @@ void SNAKokkos::compute_ui_cpu(const typen template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi_cpu(const int& iter) +void SNAKokkos::compute_zi_cpu(const int& iter) const { const int iatom = iter / idxz_max; const int jjz = iter % idxz_max; @@ -1305,7 +1305,7 @@ void SNAKokkos::compute_zi_cpu(const int& template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) const { // for j1 = 0,...,twojmax // for j2 = 0,twojmax @@ -1404,7 +1404,7 @@ void SNAKokkos::compute_bi_cpu(const typen template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi_cpu(int iter, - const Kokkos::View &beta) + const Kokkos::View &beta) const { real_type betaj; const int iatom = iter / idxz_max; @@ -1504,7 +1504,7 @@ void SNAKokkos::compute_yi_cpu(int iter, template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) const { real_type rsq, r, x, y, z, z0, theta0, cs, sn; real_type dz0dr; @@ -1536,7 +1536,7 @@ void SNAKokkos::compute_duidrj_cpu(const t template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int 
iatom, int jnbor) const { t_scalar3 final_sum; const int jelem = element(iatom, jnbor); @@ -1604,7 +1604,7 @@ template KOKKOS_INLINE_FUNCTION void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, const real_type& r, const real_type& wj, const real_type& rcut, - const real_type& sinner, const real_type& dinner, int jelem) + const real_type& sinner, const real_type& dinner, int jelem) const { const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj; @@ -1634,7 +1634,7 @@ void SNAKokkos::add_uarraytot(const typena template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) + const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const { real_type r0inv; real_type a_r, b_r, a_i, b_i; @@ -1728,7 +1728,7 @@ void SNAKokkos::compute_duarray_cpu(const const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r, const real_type& dz0dr, const real_type& wj, const real_type& rcut, - const real_type& sinner, const real_type& dinner) + const real_type& sinner, const real_type& dinner) const { real_type r0inv; real_type a_r, a_i, b_r, b_i; @@ -2206,7 +2206,7 @@ int SNAKokkos::compute_ncoeff() template KOKKOS_INLINE_FUNCTION -real_type SNAKokkos::compute_sfac(real_type r, real_type rcut, real_type sinner, real_type dinner) +real_type SNAKokkos::compute_sfac(real_type r, real_type rcut, real_type sinner, real_type dinner) const { real_type sfac_outer; constexpr real_type one = static_cast(1.0); @@ -2239,7 +2239,7 @@ real_type SNAKokkos::compute_sfac(real_typ template KOKKOS_INLINE_FUNCTION -real_type SNAKokkos::compute_dsfac(real_type r, real_type rcut, real_type sinner, real_type dinner) +real_type SNAKokkos::compute_dsfac(real_type r, real_type rcut, 
real_type sinner, real_type dinner) const { real_type sfac_outer, dsfac_outer, sfac_inner, dsfac_inner; constexpr real_type one = static_cast(1.0); @@ -2287,7 +2287,7 @@ real_type SNAKokkos::compute_dsfac(real_ty template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_s_dsfac(const real_type r, const real_type rcut, const real_type sinner, const real_type dinner, real_type& sfac, real_type& dsfac) { +void SNAKokkos::compute_s_dsfac(const real_type r, const real_type rcut, const real_type sinner, const real_type dinner, real_type& sfac, real_type& dsfac) const { real_type sfac_outer, dsfac_outer, sfac_inner, dsfac_inner; constexpr real_type one = static_cast(1.0); From e95d1dec3ff8b530544587582cdfb93e08bcf786 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 10:35:58 -0800 Subject: [PATCH 05/30] Removed AoSoA from a/b/da/db/sfac pack --- src/KOKKOS/sna_kokkos.h | 10 +-- src/KOKKOS/sna_kokkos_impl.h | 135 ++++++++++++++++++----------------- 2 files changed, 74 insertions(+), 71 deletions(-) diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 9456a219d4..17514efe7c 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -302,11 +302,11 @@ class SNAKokkos { t_sna_4c3_ll dulist; // Modified structures for GPU backend - t_sna_3c_ll a_pack; // Cayley-Klein `a` - t_sna_3c_ll b_pack; // `b` - t_sna_4c_ll da_pack; // `da` - t_sna_4c_ll db_pack; // `db` - t_sna_4d_ll sfac_pack; // sfac, dsfac_{x,y,z} + t_sna_2c a_gpu; // Cayley-Klein `a` + t_sna_2c b_gpu; // `b` + t_sna_3c da_gpu; // `da` + t_sna_3c db_gpu; // `db` + t_sna_3d sfac_gpu; // sfac, dsfac_{x,y,z} t_sna_4d_ll ulisttot_re_pack; // split real, t_sna_4d_ll ulisttot_im_pack; // imag, AoSoA, flattened diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 7ebdcc5a0a..a6e7d46321 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -315,11 +315,11 @@ void SNAKokkos::grow_rij(int newnatom, int if constexpr (!host_flag) { - 
MemKK::realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div); - MemKK::realloc_kokkos(b_pack,"sna:b_pack",vector_length,nmax,natom_div); - MemKK::realloc_kokkos(da_pack,"sna:da_pack",vector_length,nmax,natom_div,3); - MemKK::realloc_kokkos(db_pack,"sna:db_pack",vector_length,nmax,natom_div,3); - MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",vector_length,nmax,natom_div,4); + MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",natom_pad,nmax); + MemKK::realloc_kokkos(b_gpu,"sna:b_gpu",natom_pad,nmax); + MemKK::realloc_kokkos(da_gpu,"sna:da_gpu",natom_pad,nmax,3); + MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",natom_pad,nmax,3); + MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",natom_pad,nmax,4); MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1); MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div); @@ -335,11 +335,11 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div); MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); } else { - MemKK::realloc_kokkos(a_pack,"sna:a_pack",1,1,1); - MemKK::realloc_kokkos(b_pack,"sna:b_pack",1,1,1); - MemKK::realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1); - MemKK::realloc_kokkos(db_pack,"sna:db_pack",1,1,1,1); - MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",1,1,1,1); + MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",1,1); + MemKK::realloc_kokkos(b_gpu,"sna:b_gpu",1,1); + MemKK::realloc_kokkos(da_gpu,"sna:da_gpu",1,1,1); + MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",1,1,1); + MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",1,1,1); MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom_pad); MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom_pad); MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1); @@ -427,28 +427,28 @@ void 
SNAKokkos::compute_cayley_klein(const const real_type dsfacuy = dsfac * uy; const real_type dsfacuz = dsfac * uz; - a_pack(iatom_mod,jnbor,iatom_div) = a; - b_pack(iatom_mod,jnbor,iatom_div) = b; + a_gpu(iatom,jnbor) = a; + b_gpu(iatom,jnbor) = b; - da_pack(iatom_mod,jnbor,iatom_div,0) = dax; - db_pack(iatom_mod,jnbor,iatom_div,0) = dbx; + da_gpu(iatom,jnbor,0) = dax; + db_gpu(iatom,jnbor,0) = dbx; - da_pack(iatom_mod,jnbor,iatom_div,1) = day; - db_pack(iatom_mod,jnbor,iatom_div,1) = dby; + da_gpu(iatom,jnbor,1) = day; + db_gpu(iatom,jnbor,1) = dby; - da_pack(iatom_mod,jnbor,iatom_div,2) = daz; - db_pack(iatom_mod,jnbor,iatom_div,2) = dbz; + da_gpu(iatom,jnbor,2) = daz; + db_gpu(iatom,jnbor,2) = dbz; - sfac_pack(iatom_mod,jnbor,iatom_div,0) = sfac; - sfac_pack(iatom_mod,jnbor,iatom_div,1) = dsfacux; - sfac_pack(iatom_mod,jnbor,iatom_div,2) = dsfacuy; - sfac_pack(iatom_mod,jnbor,iatom_div,3) = dsfacuz; + sfac_gpu(iatom,jnbor,0) = sfac; + sfac_gpu(iatom,jnbor,1) = dsfacux; + sfac_gpu(iatom,jnbor,2) = dsfacuy; + sfac_gpu(iatom,jnbor,3) = dsfacuz; // we need to explicitly zero `dedr` somewhere before hitting // ComputeFusedDeidrj --- this is just a convenient place to do it. 
- dedr(iatom_mod + vector_length * iatom_div, jnbor, 0) = static_cast(0.); - dedr(iatom_mod + vector_length * iatom_div, jnbor, 1) = static_cast(0.); - dedr(iatom_mod + vector_length * iatom_div, jnbor, 2) = static_cast(0.); + dedr(iatom, jnbor, 0) = static_cast(0.); + dedr(iatom, jnbor, 1) = static_cast(0.); + dedr(iatom, jnbor, 2) = static_cast(0.); } @@ -496,7 +496,7 @@ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) const { - + const int iatom = iatom_mod + vector_length * iatom_div; // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer const int tile_size = vector_length * (twojmax + 1); @@ -508,11 +508,11 @@ void SNAKokkos::compute_ui_small(const typ const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters - const complex a = a_pack(iatom_mod, jnbor, iatom_div); - const complex b = b_pack(iatom_mod, jnbor, iatom_div); - const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + const complex a = a_gpu(iatom, jnbor); + const complex b = b_gpu(iatom, jnbor); + const real_type sfac = sfac_gpu(iatom, jnbor, 0); - const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + const int jelem = element(iatom, jnbor); // we need to "choose" when to bend // this for loop is here for context --- we expose additional @@ -527,6 +527,7 @@ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) const { + const int iatom = iatom_mod + vector_length * iatom_div; // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer const int tile_size = vector_length * (twojmax + 1); @@ -538,11 +539,11 
@@ void SNAKokkos::compute_ui_large(const typ const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters - const complex a = a_pack(iatom_mod, jnbor, iatom_div); - const complex b = b_pack(iatom_mod, jnbor, iatom_div); - const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + const complex a = a_gpu(iatom, jnbor); + const complex b = b_gpu(iatom, jnbor); + const real_type sfac = sfac_gpu(iatom, jnbor, 0); - const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + const int jelem = element(iatom, jnbor); // we need to "choose" when to bend #ifdef LMP_KK_DEVICE_COMPILE @@ -786,7 +787,7 @@ void SNAKokkos::compute_bi(const int& iato template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_pack) const + const Kokkos::View &beta_gpu) const { int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; @@ -809,7 +810,7 @@ void SNAKokkos::compute_yi(int iatom_mod, // pick out right beta value for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_pack); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_gpu); Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re); Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im); @@ -827,7 +828,7 @@ void SNAKokkos::compute_yi(int iatom_mod, template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_pack) const + const Kokkos::View &beta_gpu) const { int j1, j2, j, jju_half; idxz(jjz).get_yi_with_zlist(j1, j2, j, jju_half); @@ -843,7 +844,7 @@ void SNAKokkos::compute_yi_with_zlist(int // pick out right beta value 
for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_pack); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_gpu); Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re); Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im); @@ -911,7 +912,7 @@ template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::real_type SNAKokkos::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div, - const Kokkos::View &beta_pack) const { + const Kokkos::View &beta_gpu) const { real_type betaj = 0; @@ -919,18 +920,18 @@ typename SNAKokkos::real_type SNAKokkos(3) * beta_pack(iatom_mod, itriple, iatom_div); - else betaj = static_cast(2) * beta_pack(iatom_mod, itriple, iatom_div); - } else betaj = beta_pack(iatom_mod, itriple, iatom_div); + if (j2 == j) betaj = static_cast(3) * beta_gpu(iatom_mod, itriple, iatom_div); + else betaj = static_cast(2) * beta_gpu(iatom_mod, itriple, iatom_div); + } else betaj = beta_gpu(iatom_mod, itriple, iatom_div); } else if (j >= j2) { const int jjb = idxb_block(j, j2, j1); const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = static_cast(2) * beta_pack(iatom_mod, itriple, iatom_div); - else betaj = beta_pack(iatom_mod, itriple, iatom_div); + if (j2 == j) betaj = static_cast(2) * beta_gpu(iatom_mod, itriple, iatom_div); + else betaj = beta_gpu(iatom_mod, itriple, iatom_div); } else { const int jjb = idxb_block(j2, j, j1); const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; - betaj = beta_pack(iatom_mod, itriple, iatom_div); + betaj = beta_gpu(iatom_mod, itriple, iatom_div); } if (!bnorm_flag && j1 > j) { @@ -953,6 +954,7 @@ 
template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) const { + const int iatom = iatom_mod + vector_length * iatom_div; // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer const int tile_size = vector_length * (twojmax + 1); @@ -965,21 +967,21 @@ void SNAKokkos::compute_fused_deidrj_small WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters - const complex a = a_pack(iatom_mod, jnbor, iatom_div); - const complex b = b_pack(iatom_mod, jnbor, iatom_div); - const complex da = da_pack(iatom_mod, jnbor, iatom_div, dir); - const complex db = db_pack(iatom_mod, jnbor, iatom_div, dir); - const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); - const real_type dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u + const complex a = a_gpu(iatom, jnbor); + const complex b = b_gpu(iatom, jnbor); + const complex da = da_gpu(iatom, jnbor, dir); + const complex db = db_gpu(iatom, jnbor, dir); + const real_type sfac = sfac_gpu(iatom, jnbor, 0); + const real_type dsfacu = sfac_gpu(iatom, jnbor, dir + 1); // dsfac * u - const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + const int jelem = element(iatom, jnbor); // compute the contribution to dedr_full_sum for one "bend" location const real_type dedr_full_sum = evaluate_duidrj_jbend(ulist_wrapper, a, b, sfac, dulist_wrapper, da, db, dsfacu, jelem, iatom_mod, j_bend, iatom_div); // dedr gets zeroed out at the start of each iteration in compute_cayley_klein - Kokkos::atomic_add(&(dedr(iatom_mod + vector_length * iatom_div, jnbor, dir)), static_cast(2.0) * dedr_full_sum); + Kokkos::atomic_add(&(dedr(iatom, jnbor, dir)), static_cast(2.0) * dedr_full_sum); } @@ -990,6 +992,7 @@ 
template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) const { + const int iatom = iatom_mod + vector_length * iatom_div; // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer const int tile_size = vector_length * (twojmax + 1); @@ -1002,14 +1005,14 @@ void SNAKokkos::compute_fused_deidrj_large WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters - const complex a = a_pack(iatom_mod, jnbor, iatom_div); - const complex b = b_pack(iatom_mod, jnbor, iatom_div); - const complex da = da_pack(iatom_mod, jnbor, iatom_div, dir); - const complex db = db_pack(iatom_mod, jnbor, iatom_div, dir); - const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); - const real_type dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u + const complex a = a_gpu(iatom, jnbor); + const complex b = b_gpu(iatom, jnbor); + const complex da = da_gpu(iatom, jnbor, dir); + const complex db = db_gpu(iatom, jnbor, dir); + const real_type sfac = sfac_gpu(iatom, jnbor, 0); + const real_type dsfacu = sfac_gpu(iatom, jnbor, dir + 1); // dsfac * u - const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + const int jelem = element(iatom, jnbor); // compute the contributions to dedr_full_sum for all "bend" locations real_type dedr_full_sum = static_cast(0); @@ -1022,7 +1025,7 @@ void SNAKokkos::compute_fused_deidrj_large } // there's one thread per atom, neighbor pair, so no need to make this atomic - dedr(iatom_mod + vector_length * iatom_div, jnbor, dir) = static_cast(2.0) * dedr_full_sum; + dedr(iatom, jnbor, dir) = static_cast(2.0) * dedr_full_sum; } @@ -2337,11 +2340,11 @@ double SNAKokkos::memory_usage() if constexpr (!host_flag) { - bytes += 
MemKK::memory_usage(a_pack); - bytes += MemKK::memory_usage(b_pack); - bytes += MemKK::memory_usage(da_pack); - bytes += MemKK::memory_usage(db_pack); - bytes += MemKK::memory_usage(sfac_pack); + bytes += MemKK::memory_usage(a_gpu); + bytes += MemKK::memory_usage(b_gpu); + bytes += MemKK::memory_usage(da_gpu); + bytes += MemKK::memory_usage(db_gpu); + bytes += MemKK::memory_usage(sfac_gpu); bytes += MemKK::memory_usage(ulisttot_re_pack); From 120c4600cff88142d862268cc5c5bc042ee9386e Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 10:44:30 -0800 Subject: [PATCH 06/30] Removed AoSoA from ulisttot_[re_/im_/]_pack --- src/KOKKOS/pair_snap_kokkos_impl.h | 23 ++++++------ src/KOKKOS/sna_kokkos.h | 12 +++---- src/KOKKOS/sna_kokkos_impl.h | 56 ++++++++++++++---------------- 3 files changed, 44 insertions(+), 47 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 492c401074..9e25e08f6c 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -746,26 +746,25 @@ template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - const int ii = iatom_mod + iatom_div * vector_length; - if (ii >= chunk_size) return; + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; - const int ninside = d_ninside(ii); + const int ninside = d_ninside(iatom); if (jnbor >= ninside) return; - snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div); + snaKK.compute_cayley_klein(iatom,jnbor); } template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; - const int ii = iatom_mod + iatom_div * vector_length; - if (ii >= chunk_size) return; - - int itype = type(ii); + int itype = 
type(iatom); int ielem = d_map[itype]; - snaKK.pre_ui(iatom_mod, j, ielem, iatom_div); + snaKK.pre_ui(iatom, j, ielem); } template @@ -834,8 +833,8 @@ void PairSNAPKokkos::operator() (TagPairSN const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_re = snaKK.ulisttot_re_gpu(iatom, mapper.idxu_half, ielem); + auto utot_im = snaKK.ulisttot_im_gpu(iatom, mapper.idxu_half, ielem); if (mapper.flip_sign == 1) { utot_im = -utot_im; @@ -843,7 +842,7 @@ void PairSNAPKokkos::operator() (TagPairSN utot_re = -utot_re; } - snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + snaKK.ulisttot_gpu(iatom, idxu, ielem) = { utot_re, utot_im }; if (mapper.flip_sign == 0) { snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 17514efe7c..38b3f8f1b8 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -187,9 +187,9 @@ class SNAKokkos { // functions for bispectrum coefficients, GPU only KOKKOS_INLINE_FUNCTION - void compute_cayley_klein(const int&, const int&, const int&) const; + void compute_cayley_klein(const int&, const int&) const; KOKKOS_INLINE_FUNCTION - void pre_ui(const int&, const int&, const int&, const int&) const; // ForceSNAP + void pre_ui(const int&, const int&, const int&) const; // ForceSNAP // version of the code with parallelism over j_bend KOKKOS_INLINE_FUNCTION @@ -223,7 +223,7 @@ class SNAKokkos { // plugged into compute_ui_small, compute_ui_large KOKKOS_FORCEINLINE_FUNCTION void evaluate_ui_jbend(const WignerWrapper&, const complex&, const complex&, const real_type&, const int&, - const int&, const int&, const int&) const; + const int&, const int&) const; // plugged into compute_zi, compute_yi KOKKOS_FORCEINLINE_FUNCTION complex evaluate_zi(const 
int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, @@ -308,9 +308,9 @@ class SNAKokkos { t_sna_3c db_gpu; // `db` t_sna_3d sfac_gpu; // sfac, dsfac_{x,y,z} - t_sna_4d_ll ulisttot_re_pack; // split real, - t_sna_4d_ll ulisttot_im_pack; // imag, AoSoA, flattened - t_sna_4c_ll ulisttot_pack; // AoSoA layout + t_sna_3d_ll ulisttot_re_gpu; // split real, + t_sna_3d_ll ulisttot_im_gpu; // imag, AoSoA, flattened + t_sna_3c_ll ulisttot_gpu; // packed and de-symmetrized t_sna_4c_ll zlist_pack; // AoSoA layout t_sna_4d_ll blist_pack; t_sna_4d_ll ylist_pack_re; // split real, diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index a6e7d46321..e2f11222b3 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -322,9 +322,9 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",natom_pad,nmax,4); MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1); - MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im_pack",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",vector_length,idxu_max,nelements,natom_div); + MemKK::realloc_kokkos(ulisttot_re_gpu,"sna:ulisttot_re_gpu",natom_pad,idxu_half_max,nelements); + MemKK::realloc_kokkos(ulisttot_im_gpu,"sna:ulisttot_im_gpu",natom_pad,idxu_half_max,nelements); + MemKK::realloc_kokkos(ulisttot_gpu,"sna:ulisttot_gpu",natom_pad,idxu_max,nelements); MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1); MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div); @@ -342,9 +342,9 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",1,1,1); 
MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom_pad); MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom_pad); - MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1); - MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1); - MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1); + MemKK::realloc_kokkos(ulisttot_re_gpu,"sna:ulisttot_re_gpu",1,1,1); + MemKK::realloc_kokkos(ulisttot_im_gpu,"sna:ulisttot_im_gpu",1,1,1); + MemKK::realloc_kokkos(ulisttot_gpu,"sna:ulisttot_pack_gpu",1,1,1); MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom_pad,nmax); MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom_pad); MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1); @@ -371,9 +371,8 @@ void SNAKokkos::grow_rij(int newnatom, int template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_cayley_klein(const int& iatom_mod, const int& jnbor, const int& iatom_div) const +void SNAKokkos::compute_cayley_klein(const int& iatom, const int& jnbor) const { - const int iatom = iatom_mod + vector_length * iatom_div; const real_type x = rij(iatom,jnbor,0); const real_type y = rij(iatom,jnbor,1); const real_type z = rij(iatom,jnbor,2); @@ -460,9 +459,8 @@ void SNAKokkos::compute_cayley_klein(const template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int& iatom_div) const +void SNAKokkos::pre_ui(const int& iatom, const int& j, const int& ielem) const { - for (int jelem = 0; jelem < nelements; jelem++) { int jju_half = idxu_half_block(j); @@ -475,8 +473,8 @@ void SNAKokkos::pre_ui(const int& iatom_mo real_type re_part = static_cast(0.); if (ma == mb && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; } - ulisttot_re_pack(iatom_mod, jju_half, jelem, iatom_div) = re_part; - ulisttot_im_pack(iatom_mod, jju_half, jelem, iatom_div) = static_cast(0.); + ulisttot_re_gpu(iatom, jju_half, jelem) = 
re_part; + ulisttot_im_gpu(iatom, jju_half, jelem) = static_cast(0.); jju_half++; } @@ -518,7 +516,7 @@ void SNAKokkos::compute_ui_small(const typ // this for loop is here for context --- we expose additional // parallelism over this loop instead //for (int j_bend = 0; j_bend <= twojmax; j_bend++) { - evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div); + evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom, j_bend); } // Version of the code that loops over all `j_bend` values which reduces integer arithmetic @@ -550,7 +548,7 @@ void SNAKokkos::compute_ui_large(const typ #pragma unroll #endif for (int j_bend = 0; j_bend <= twojmax; j_bend++) { - evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div); + evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom, j_bend); } } @@ -559,9 +557,8 @@ template KOKKOS_FORCEINLINE_FUNCTION void SNAKokkos::evaluate_ui_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const real_type& sfac, const int& jelem, - const int& iatom_mod, const int& j_bend, const int& iatom_div) const + const int& iatom, const int& j_bend) const { - // utot(j,ma,mb) = 0 for all j,ma,ma // utot(j,ma,ma) = 1 for all j,ma // for j in neighbors of i: @@ -621,8 +618,8 @@ void SNAKokkos::evaluate_ui_jbend(const Wi const complex ulist_prev = ulist_wrapper.get(ma); // atomic add the previous level here - Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); - Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); + Kokkos::atomic_add(&(ulisttot_re_gpu(iatom, jjup + ma, jelem)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im_gpu(iatom, jjup + ma, jelem)), ulist_prev.im * sfac); // ulist_accum += rootpq * b * ulist_prev; real_type rootpq = rootpqarray(j - ma, mb); @@ -651,8 +648,8 @@ void SNAKokkos::evaluate_ui_jbend(const Wi const complex ulist_prev = 
ulist_wrapper.get(ma); // atomic add the previous level here - Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); - Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); + Kokkos::atomic_add(&(ulisttot_re_gpu(iatom, jjup + ma, jelem)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im_gpu(iatom, jjup + ma, jelem)), ulist_prev.im * sfac); } } @@ -695,6 +692,7 @@ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) const { + const int iatom = iatom_mod + vector_length * iatom_div; // for j1 = 0,...,twojmax // for j2 = 0,twojmax // for j = |j1-j2|,Min(twojmax,j1+j2),2 @@ -725,7 +723,7 @@ void SNAKokkos::compute_bi(const int& iato const int jju_index = jju+mb*(j+1)+ma; const int jjz_index = jjz+mb*(j+1)+ma; if (2*mb == j) return; // I think we can remove this? - const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const complex utot = ulisttot_gpu(iatom, jju_index, elem3); const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } @@ -741,7 +739,7 @@ void SNAKokkos::compute_bi(const int& iato const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const complex utot = ulisttot_gpu(iatom, jju_index, elem3); const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; @@ -752,7 +750,7 @@ void SNAKokkos::compute_bi(const int& iato const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const complex utot = ulisttot_gpu(iatom, jju_index, elem3); const complex zloc = zlist_pack(iatom_mod, jjz_index, 
idouble, iatom_div); sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); } // end if jeven @@ -861,7 +859,7 @@ KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::complex SNAKokkos::evaluate_zi(const int& j1, const int& j2, const int& j, const int& ma1min, const int& ma2max, const int& mb1min, const int& mb2max, const int& na, const int& nb, const int& iatom_mod, const int& elem1, const int& elem2, const int& iatom_div, const real_type* cgblock) const { - + const int iatom = iatom_mod + vector_length * iatom_div; complex ztmp = complex::zero(); int jju1 = idxu_block[j1] + (j1+1)*mb1min; @@ -881,8 +879,8 @@ typename SNAKokkos::complex SNAKokkos::memory_usage() bytes += MemKK::memory_usage(sfac_gpu); - bytes += MemKK::memory_usage(ulisttot_re_pack); - bytes += MemKK::memory_usage(ulisttot_im_pack); - bytes += MemKK::memory_usage(ulisttot_pack); + bytes += MemKK::memory_usage(ulisttot_re_gpu); + bytes += MemKK::memory_usage(ulisttot_im_gpu); + bytes += MemKK::memory_usage(ulisttot_gpu); bytes += MemKK::memory_usage(zlist_pack); bytes += MemKK::memory_usage(blist_pack); From 2fc33f3fd429cc7109a6cfac2aa5a6da40e934a8 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 10:49:47 -0800 Subject: [PATCH 07/30] Removed AoSoA from [ylist_re/ylist_im/zlist/blist] --- src/KOKKOS/pair_snap_kokkos_impl.h | 13 +++--- src/KOKKOS/sna_kokkos.h | 22 +++++----- src/KOKKOS/sna_kokkos_impl.h | 68 +++++++++++++++--------------- 3 files changed, 49 insertions(+), 54 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 9e25e08f6c..31a9463a4a 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -824,7 +824,6 @@ void PairSNAPKokkos::operator() (TagPairSN const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (idxu > snaKK.idxu_max) return; int elem_count = chemflag ? 
nelements : 1; @@ -845,8 +844,8 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.ulisttot_gpu(iatom, idxu, ielem) = { utot_re, utot_im }; if (mapper.flip_sign == 0) { - snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + snaKK.ylist_re_gpu(iatom, mapper.idxu_half, ielem) = 0.; + snaKK.ylist_im_gpu(iatom, mapper.idxu_half, ielem) = 0.; } } } @@ -881,10 +880,9 @@ void PairSNAPKokkos::operator() (TagPairSN const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjz >= snaKK.idxz_max) return; - snaKK.compute_zi(iatom_mod,jjz,iatom_div); + snaKK.compute_zi(iatom, jjz); } template @@ -896,7 +894,7 @@ void PairSNAPKokkos::operator() (TagPairSN if (jjb >= snaKK.idxb_max) return; - snaKK.compute_bi(iatom_mod,jjb,iatom_div); + snaKK.compute_bi(iatom,jjb); } template @@ -905,14 +903,13 @@ void PairSNAPKokkos::operator() (TagPairSN const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (idxb >= snaKK.idxb_max) return; const int ntriples = snaKK.ntriples; for (int itriple = 0; itriple < ntriples; itriple++) { - const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div); + const real_type blocal = snaKK.blist_gpu(iatom, idxb, itriple); snaKK.blist(iatom, itriple, idxb) = blocal; } diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 38b3f8f1b8..5cbcb2acf4 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -199,7 +199,7 @@ class SNAKokkos { void compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_zi(const int&, const int&, const int&) const; // ForceSNAP + void compute_zi(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi(int,int,int, const Kokkos::View &beta_pack) const; // ForceSNAP @@ -207,7 +207,7 @@ 
class SNAKokkos { void compute_yi_with_zlist(int,int,int, const Kokkos::View &beta_pack) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_bi(const int&, const int&, const int&) const; // ForceSNAP + void compute_bi(const int&, const int&) const; // ForceSNAP // functions for derivatives, GPU only // version of the code with parallelism over j_bend @@ -227,7 +227,7 @@ class SNAKokkos { // plugged into compute_zi, compute_yi KOKKOS_FORCEINLINE_FUNCTION complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, - const int&, const int&, const int&, const int&, const real_type*) const; + const int&, const int&, const int&, const real_type*) const; // plugged into compute_yi, compute_yi_with_zlist KOKKOS_FORCEINLINE_FUNCTION real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, @@ -236,7 +236,7 @@ class SNAKokkos { KOKKOS_FORCEINLINE_FUNCTION real_type evaluate_duidrj_jbend(const WignerWrapper&, const complex&, const complex&, const real_type&, const WignerWrapper&, const complex&, const complex&, const real_type&, - const int&, const int&, const int&, const int&) const; + const int&, const int&, const int&) const; // functions for bispectrum coefficients, CPU only KOKKOS_INLINE_FUNCTION @@ -308,13 +308,13 @@ class SNAKokkos { t_sna_3c db_gpu; // `db` t_sna_3d sfac_gpu; // sfac, dsfac_{x,y,z} - t_sna_3d_ll ulisttot_re_gpu; // split real, - t_sna_3d_ll ulisttot_im_gpu; // imag, AoSoA, flattened - t_sna_3c_ll ulisttot_gpu; // packed and de-symmetrized - t_sna_4c_ll zlist_pack; // AoSoA layout - t_sna_4d_ll blist_pack; - t_sna_4d_ll ylist_pack_re; // split real, - t_sna_4d_ll ylist_pack_im; // imag AoSoA layout + t_sna_3d ulisttot_re_gpu; // split real, + t_sna_3d ulisttot_im_gpu; // imag, AoSoA, flattened + t_sna_3c ulisttot_gpu; // packed and de-symmetrized + t_sna_3c zlist_gpu; + t_sna_3d blist_gpu; + t_sna_3d 
ylist_re_gpu; // split real, + t_sna_3d ylist_im_gpu; // imag int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index e2f11222b3..eab9b99b6f 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -327,12 +327,12 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(ulisttot_gpu,"sna:ulisttot_gpu",natom_pad,idxu_max,nelements); MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1); - MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div); + MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",natom_pad,idxz_max,ndoubles); MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); - MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",vector_length,idxb_max,ntriples,natom_div); + MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",natom_pad,idxb_max,ntriples); MemKK::realloc_kokkos(ylist,"sna:ylist",1,1,1); - MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div); + MemKK::realloc_kokkos(ylist_re_gpu,"sna:ylist_re_gpu",natom_pad,idxu_half_max,nelements); + MemKK::realloc_kokkos(ylist_im_gpu,"sna:ylist_im_gpu",natom_pad,idxu_half_max,nelements); MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); } else { MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",1,1); @@ -347,12 +347,12 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(ulisttot_gpu,"sna:ulisttot_pack_gpu",1,1,1); MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom_pad,nmax); MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom_pad); - MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1); + MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",1,1,1); 
MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); - MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",1,1,1,1); + MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",1,1,1); MemKK::realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom_pad); - MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1); - MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1); + MemKK::realloc_kokkos(ylist_re_gpu,"sna:ylist_pack_re",1,1,1); + MemKK::realloc_kokkos(ylist_im_gpu,"sna:ylist_pack_im",1,1,1); MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom_pad,nmax); } @@ -662,9 +662,8 @@ void SNAKokkos::evaluate_ui_jbend(const Wi template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div) const +void SNAKokkos::compute_zi(const int& iatom, const int& jjz) const { - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg; idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg); @@ -675,7 +674,7 @@ void SNAKokkos::compute_zi(const int& iato for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - zlist_pack(iatom_mod,jjz,idouble,iatom_div) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom_mod, elem1, elem2, iatom_div, cgblock); + zlist_gpu(iatom,jjz,idouble) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); idouble++; } @@ -690,9 +689,8 @@ void SNAKokkos::compute_zi(const int& iato template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) const +void SNAKokkos::compute_bi(const int& iatom, const int& jjb) const { - const int iatom = iatom_mod + vector_length * iatom_div; // for j1 = 0,...,twojmax // for j2 = 0,twojmax // for j = |j1-j2|,Min(twojmax,j1+j2),2 @@ -724,7 +722,7 @@ void SNAKokkos::compute_bi(const int& iato const int jjz_index = jjz+mb*(j+1)+ma; if (2*mb == j) return; 
// I think we can remove this? const complex utot = ulisttot_gpu(iatom, jju_index, elem3); - const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); + const complex zloc = zlist_gpu(iatom, jjz_index, idouble); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } } @@ -740,7 +738,7 @@ void SNAKokkos::compute_bi(const int& iato const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; const complex utot = ulisttot_gpu(iatom, jju_index, elem3); - const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); + const complex zloc = zlist_gpu(iatom, jjz_index, idouble); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } @@ -751,7 +749,7 @@ void SNAKokkos::compute_bi(const int& iato const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; const complex utot = ulisttot_gpu(iatom, jju_index, elem3); - const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); + const complex zloc = zlist_gpu(iatom, jjz_index, idouble); sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); } // end if jeven @@ -765,7 +763,7 @@ void SNAKokkos::compute_bi(const int& iato sumzu -= bzero[j]; } } - blist_pack(iatom_mod, jjb, itriple, iatom_div) = sumzu; + blist_gpu(iatom, jjb, itriple) = sumzu; //} // end loop over j //} // end loop over j1, j2 itriple++; @@ -787,7 +785,7 @@ KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, const Kokkos::View &beta_gpu) const { - + const int iatom = iatom_mod + vector_length * iatom_div; int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg); @@ -798,7 +796,7 @@ void SNAKokkos::compute_yi(int iatom_mod, for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom_mod, elem1, elem2, iatom_div, cgblock); + const complex ztmp = evaluate_zi(j1, j2, 
j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); // apply to z(j1,j2,j,ma,mb) to unique element of y(j) // find right y_list[jju] and beta(iatom,jjb) entries @@ -810,8 +808,8 @@ void SNAKokkos::compute_yi(int iatom_mod, const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_gpu); - Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im); + Kokkos::atomic_add(&(ylist_re_gpu(iatom, jju_half, elem3)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im_gpu(iatom, jju_half, elem3)), betaj * ztmp.im); } // end loop over elem3 } // end loop over elem2 } // end loop over elem1 @@ -828,13 +826,14 @@ KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div, const Kokkos::View &beta_gpu) const { + const int iatom = iatom_mod + vector_length * iatom_div; int j1, j2, j, jju_half; idxz(jjz).get_yi_with_zlist(j1, j2, j, jju_half); int idouble = 0; for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - const complex ztmp = zlist_pack(iatom_mod,jjz,idouble,iatom_div); + const complex ztmp = zlist_gpu(iatom,jjz,idouble); // apply to z(j1,j2,j,ma,mb) to unique element of y(j) // find right y_list[jju] and beta(iatom,jjb) entries // multiply and divide by j+1 factors @@ -844,8 +843,8 @@ void SNAKokkos::compute_yi_with_zlist(int const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_gpu); - Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im); + Kokkos::atomic_add(&(ylist_re_gpu(iatom, jju_half, elem3)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im_gpu(iatom, jju_half, elem3)), betaj * ztmp.im); } // end loop 
over elem3 idouble++; } // end loop over elem2 @@ -858,8 +857,7 @@ template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::complex SNAKokkos::evaluate_zi(const int& j1, const int& j2, const int& j, const int& ma1min, const int& ma2max, const int& mb1min, const int& mb2max, const int& na, const int& nb, - const int& iatom_mod, const int& elem1, const int& elem2, const int& iatom_div, const real_type* cgblock) const { - const int iatom = iatom_mod + vector_length * iatom_div; + const int& iatom, const int& elem1, const int& elem2, const real_type* cgblock) const { complex ztmp = complex::zero(); int jju1 = idxu_block[j1] + (j1+1)*mb1min; @@ -976,7 +974,7 @@ void SNAKokkos::compute_fused_deidrj_small // compute the contribution to dedr_full_sum for one "bend" location const real_type dedr_full_sum = evaluate_duidrj_jbend(ulist_wrapper, a, b, sfac, dulist_wrapper, da, db, dsfacu, - jelem, iatom_mod, j_bend, iatom_div); + jelem, iatom, j_bend); // dedr gets zeroed out at the start of each iteration in compute_cayley_klein Kokkos::atomic_add(&(dedr(iatom, jnbor, dir)), static_cast(2.0) * dedr_full_sum); @@ -1019,7 +1017,7 @@ void SNAKokkos::compute_fused_deidrj_large #endif for (int j_bend = 0; j_bend <= twojmax; j_bend++) { dedr_full_sum += evaluate_duidrj_jbend(ulist_wrapper, a, b, sfac, dulist_wrapper, da, db, dsfacu, - jelem, iatom_mod, j_bend, iatom_div); + jelem, iatom,j_bend); } // there's one thread per atom, neighbor pair, so no need to make this atomic @@ -1033,7 +1031,7 @@ template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::real_type SNAKokkos::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const real_type& sfac, const WignerWrapper& dulist_wrapper, const complex& da, const complex& db, const real_type& dsfacu, - const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) const { + const int& jelem, const int& iatom, const int& j_bend) const { real_type dedr_full_sum = static_cast(0); @@ 
-1103,7 +1101,7 @@ typename SNAKokkos::real_type SNAKokkos::real_type SNAKokkos(0.5)*y_local; } else if (ma > (mb-1)) { y_local.re = static_cast(0.); y_local.im = static_cast(0.); } // can probably avoid this outright @@ -2349,11 +2347,11 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(ulisttot_im_gpu); bytes += MemKK::memory_usage(ulisttot_gpu); - bytes += MemKK::memory_usage(zlist_pack); - bytes += MemKK::memory_usage(blist_pack); + bytes += MemKK::memory_usage(zlist_gpu); + bytes += MemKK::memory_usage(blist_gpu); - bytes += MemKK::memory_usage(ylist_pack_re); - bytes += MemKK::memory_usage(ylist_pack_im); + bytes += MemKK::memory_usage(ylist_re_gpu); + bytes += MemKK::memory_usage(ylist_im_gpu); } else { bytes += MemKK::memory_usage(ulist); From 3c4a42ba728e0ecacd6c736947676adbe5dfdb52 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 10:54:59 -0800 Subject: [PATCH 08/30] Removed d_beta_pack; fully unified SNAPBeta and SNAPBetaCPU routines --- src/KOKKOS/pair_snap_kokkos.h | 16 ++---- src/KOKKOS/pair_snap_kokkos_impl.h | 89 ++++++++++-------------------- src/KOKKOS/sna_kokkos.h | 15 +++-- src/KOKKOS/sna_kokkos_impl.h | 42 ++++++-------- 4 files changed, 59 insertions(+), 103 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index bb599a46ea..c9607daa1c 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -36,6 +36,7 @@ PairStyle(snap/kk/host,PairSNAPKokkosDevice); namespace LAMMPS_NS { // Routines for both the CPU and GPU backend +struct TagPairSNAPBeta{}; template struct TagPairSNAPComputeForce{}; @@ -48,7 +49,6 @@ struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; -struct TagPairSNAPBeta{}; struct TagPairSNAPComputeBi{}; struct TagPairSNAPTransformBi{}; 
// re-order blist from AoSoA to AoS struct TagPairSNAPComputeYi{}; @@ -64,7 +64,6 @@ struct TagPairSNAPPreUiCPU{}; struct TagPairSNAPComputeUiCPU{}; struct TagPairSNAPTransformUiCPU{}; struct TagPairSNAPComputeZiCPU{}; -struct TagPairSNAPBetaCPU{}; struct TagPairSNAPComputeBiCPU{}; struct TagPairSNAPZeroYiCPU{}; struct TagPairSNAPComputeYiCPU{}; @@ -151,6 +150,10 @@ class PairSNAPKokkos : public PairSNAP { template void check_team_size_reduce(int, int&); + // CPU and GPU backend + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPBeta, const int& ii) const; + template KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeForce,const int& ii) const; @@ -159,9 +162,6 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeForce,const int& ii, EV_FLOAT&) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPBetaCPU,const int& ii) const; - // GPU backend only KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -184,9 +184,6 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPBeta, const int& ii) const; - KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; @@ -267,8 +264,7 @@ class PairSNAPKokkos : public PairSNAP { Kokkos::View d_dinnerelem; // element inner cutoff half-width Kokkos::View d_map; // mapping from atom types to elements Kokkos::View d_ninside; // ninside for all atoms in list - Kokkos::View d_beta; // betas for all atoms in list - Kokkos::View d_beta_pack; // betas for all atoms in list, GPU + typename SNAKokkos::t_sna_2d d_beta; // betas for all atoms in list typedef Kokkos::DualView tdual_fparams; tdual_fparams k_cutsq; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h 
b/src/KOKKOS/pair_snap_kokkos_impl.h index 31a9463a4a..0d209490fd 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -199,16 +199,18 @@ void PairSNAPKokkos::compute(int eflag_in, if (beta_max < inum) { beta_max = inum; - MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta",ncoeff,inum); - if constexpr (!host_flag) - MemKK::realloc_kokkos(d_beta_pack,"PairSNAPKokkos:beta_pack",vector_length,ncoeff,(inum + vector_length - 1) / vector_length); - MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",inum); + // padded allocation, similar to within grow_rij + const int inum_div = (inum + vector_length - 1) / vector_length; + const int inum_pad = inum_div * vector_length; + MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta", inum_pad, ncoeff); + snaKK.d_beta = d_beta; + MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum); } - chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user + chunk_size = MIN(chunksize, inum); // "chunksize" variable is set by user chunk_offset = 0; - snaKK.grow_rij(chunk_size,max_neighs); + snaKK.grow_rij(chunk_size, max_neighs); EV_FLOAT ev; @@ -271,8 +273,8 @@ void PairSNAPKokkos::compute(int eflag_in, //ComputeYi { //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); - Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this); + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBeta",policy_beta,*this); //ComputeYi int idxz_max = snaKK.idxz_max; @@ -608,27 +610,23 @@ void PairSNAPKokkos::coeff(int narg, char } /* ---------------------------------------------------------------------- - Begin routines that are unique to the GPU codepath. These take advantage - of AoSoA data layouts and scratch memory for recursive polynomials + Begin routines that are common to both the CPU and GPU codepath. 
------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPBeta, const int& iatom) const { - if (ii >= chunk_size) return; + if (iatom >= chunk_size) return; - const int iatom_mod = ii % vector_length; - const int iatom_div = ii / vector_length; - - const int i = d_ilist[ii + chunk_offset]; + const int i = d_ilist[iatom + chunk_offset]; const int itype = type[i]; const int ielem = d_map[itype]; auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1]; + d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; } if (quadraticflag) { @@ -637,21 +635,26 @@ void PairSNAPKokkos::operator() (TagPairSN for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - real_type bveci = snaKK.blist(ii, idx_chem, idxb); - d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci; + real_type bveci = snaKK.blist(iatom, idx_chem, idxb); + d_beta(iatom, icoeff) += d_coeffi[k] * bveci; k++; for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { const auto jdxb = jcoeff % idxb_max; const auto jdx_chem = jcoeff / idxb_max; - real_type bvecj = snaKK.blist(ii, jdx_chem, jdxb); - d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj; - d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci; + real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb); + d_beta(iatom, icoeff) += d_coeffi[k] * bvecj; + d_beta(iatom, jcoeff) += d_coeffi[k] * bveci; k++; } } } } +/* ---------------------------------------------------------------------- + Begin routines that are unique to the GPU codepath. 
These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials +------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -859,7 +862,7 @@ void PairSNAPKokkos::operator() (TagPairSN if (jjz >= snaKK.idxz_max) return; - snaKK.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack); + snaKK.compute_yi(iatom, jjz); } template @@ -871,7 +874,7 @@ void PairSNAPKokkos::operator() (TagPairSN if (jjz >= snaKK.idxz_max) return; - snaKK.compute_yi_with_zlist(iatom_mod,jjz,iatom_div,d_beta_pack); + snaKK.compute_yi_with_zlist(iatom, jjz); } template @@ -977,40 +980,6 @@ void PairSNAPKokkos::operator() (TagPairSN different arithmetic intensity requirements for the CPU vs GPU. ------------------------------------------------------------------------- */ -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBetaCPU,const int& ii) const { - - const int i = d_ilist[ii + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - - auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) - d_beta(icoeff,ii) = d_coeffi[icoeff+1]; - - if (quadraticflag) { - const auto idxb_max = snaKK.idxb_max; - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - const auto idxb = icoeff % idxb_max; - const auto idx_chem = icoeff / idxb_max; - real_type bveci = snaKK.blist(ii,idx_chem,idxb); - d_beta(icoeff,ii) += d_coeffi[k]*bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - const auto jdxb = jcoeff % idxb_max; - const auto jdx_chem = jcoeff / idxb_max; - real_type bvecj = snaKK.blist(ii,jdx_chem,jdxb); - d_beta(icoeff,ii) += d_coeffi[k]*bvecj; - d_beta(jcoeff,ii) += d_coeffi[k]*bveci; - k++; - } - } - } -} - template KOKKOS_INLINE_FUNCTION void 
PairSNAPKokkos::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -1170,7 +1139,7 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { - snaKK.compute_yi_cpu(ii,d_beta); + snaKK.compute_yi_cpu(ii); } template @@ -1373,8 +1342,6 @@ double PairSNAPKokkos::memory_usage() { double bytes = Pair::memory_usage(); bytes += MemKK::memory_usage(d_beta); - if constexpr (!host_flag) - bytes += MemKK::memory_usage(d_beta_pack); bytes += MemKK::memory_usage(d_ninside); bytes += MemKK::memory_usage(d_map); bytes += MemKK::memory_usage(d_radelem); diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 5cbcb2acf4..edc9d885da 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -201,11 +201,9 @@ class SNAKokkos { KOKKOS_INLINE_FUNCTION void compute_zi(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_yi(int,int,int, - const Kokkos::View &beta_pack) const; // ForceSNAP + void compute_yi(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_yi_with_zlist(int,int,int, - const Kokkos::View &beta_pack) const; // ForceSNAP + void compute_yi_with_zlist(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi(const int&, const int&) const; // ForceSNAP @@ -230,8 +228,7 @@ class SNAKokkos { const int&, const int&, const int&, const real_type*) const; // plugged into compute_yi, compute_yi_with_zlist KOKKOS_FORCEINLINE_FUNCTION - real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, - const Kokkos::View &) const; + real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&) const; // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large KOKKOS_FORCEINLINE_FUNCTION real_type 
evaluate_duidrj_jbend(const WignerWrapper&, const complex&, const complex&, const real_type&, @@ -246,8 +243,7 @@ class SNAKokkos { KOKKOS_INLINE_FUNCTION void compute_zi_cpu(const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_yi_cpu(int, - const Kokkos::View &beta) const; // ForceSNAP + void compute_yi_cpu(int) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int) const; // ForceSNAP @@ -290,6 +286,9 @@ class SNAKokkos { int twojmax, diagonalstyle; + // Beta for all atoms in list; aliases the object in PairSnapKokkos + t_sna_2d d_beta; + t_sna_3d blist; t_sna_3c_ll ulisttot; t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index eab9b99b6f..bb69f90b13 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -782,10 +782,8 @@ void SNAKokkos::compute_bi(const int& iato template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_gpu) const +void SNAKokkos::compute_yi(const int& iatom, const int& jjz) const { - const int iatom = iatom_mod + vector_length * iatom_div; int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg); @@ -806,7 +804,7 @@ void SNAKokkos::compute_yi(int iatom_mod, // pick out right beta value for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_gpu); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); Kokkos::atomic_add(&(ylist_re_gpu(iatom, jju_half, elem3)), betaj * ztmp.re); Kokkos::atomic_add(&(ylist_im_gpu(iatom, jju_half, elem3)), betaj * ztmp.im); @@ -823,10 +821,8 @@ void SNAKokkos::compute_yi(int iatom_mod, template KOKKOS_INLINE_FUNCTION -void 
SNAKokkos::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_gpu) const +void SNAKokkos::compute_yi_with_zlist(const int& iatom, const int& jjz) const { - const int iatom = iatom_mod + vector_length * iatom_div; int j1, j2, j, jju_half; idxz(jjz).get_yi_with_zlist(j1, j2, j, jju_half); @@ -841,7 +837,7 @@ void SNAKokkos::compute_yi_with_zlist(int // pick out right beta value for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_gpu); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); Kokkos::atomic_add(&(ylist_re_gpu(iatom, jju_half, elem3)), betaj * ztmp.re); Kokkos::atomic_add(&(ylist_im_gpu(iatom, jju_half, elem3)), betaj * ztmp.im); @@ -907,8 +903,7 @@ typename SNAKokkos::complex SNAKokkos KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::real_type SNAKokkos::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, - const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div, - const Kokkos::View &beta_gpu) const { + const int& iatom, const int& elem1, const int& elem2, const int& elem3) const { real_type betaj = 0; @@ -916,18 +911,18 @@ typename SNAKokkos::real_type SNAKokkos(3) * beta_gpu(iatom_mod, itriple, iatom_div); - else betaj = static_cast(2) * beta_gpu(iatom_mod, itriple, iatom_div); - } else betaj = beta_gpu(iatom_mod, itriple, iatom_div); + if (j2 == j) betaj = static_cast(3) * d_beta(iatom, itriple); + else betaj = static_cast(2) * d_beta(iatom, itriple); + } else betaj = d_beta(iatom, itriple); } else if (j >= j2) { const int jjb = idxb_block(j, j2, j1); const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = static_cast(2) * beta_gpu(iatom_mod, itriple, iatom_div); - else betaj = beta_gpu(iatom_mod, itriple, iatom_div); + if (j2 == j) betaj = static_cast(2) * d_beta(iatom, 
itriple); + else betaj = d_beta(iatom, itriple); } else { const int jjb = idxb_block(j2, j, j1); const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; - betaj = beta_gpu(iatom_mod, itriple, iatom_div); + betaj = d_beta(iatom, itriple); } if (!bnorm_flag && j1 > j) { @@ -1402,8 +1397,7 @@ void SNAKokkos::compute_bi_cpu(const typen template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi_cpu(int iter, - const Kokkos::View &beta) const +void SNAKokkos::compute_yi_cpu(int iter) const { real_type betaj; const int iatom = iter / idxz_max; @@ -1470,18 +1464,18 @@ void SNAKokkos::compute_yi_cpu(int iter, const int jjb = idxb_block(j1, j2, j); const int itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; if (j1 == j) { - if (j2 == j) betaj = 3 * beta(itriple, iatom); - else betaj = 2 * beta(itriple, iatom); - } else betaj = beta(itriple, iatom); + if (j2 == j) betaj = 3 * d_beta(iatom, itriple); + else betaj = 2 * d_beta(iatom, itriple); + } else betaj = d_beta(iatom, itriple); } else if (j >= j2) { const int jjb = idxb_block(j, j2, j1); const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = 2 * beta(itriple, iatom); - else betaj = beta(itriple, iatom); + if (j2 == j) betaj = 2 * d_beta(iatom, itriple); + else betaj = d_beta(iatom, itriple); } else { const int jjb = idxb_block(j2, j, j1); const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; - betaj = beta(itriple, iatom); + betaj = d_beta(iatom, itriple); } if (!bnorm_flag && j1 > j) From abbcd86174b331423cd9012ed726cf06cf0070a9 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 11:02:32 -0800 Subject: [PATCH 09/30] Unified ylist CPU and GPU structures --- src/KOKKOS/pair_snap_kokkos_impl.h | 7 ++- src/KOKKOS/sna_kokkos.h | 9 +-- src/KOKKOS/sna_kokkos_impl.h | 89 ++++++++++-------------------- 3 files changed, 37 insertions(+), 68 deletions(-) diff --git 
a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 0d209490fd..23e3f1d3a6 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -847,8 +847,8 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.ulisttot_gpu(iatom, idxu, ielem) = { utot_re, utot_im }; if (mapper.flip_sign == 0) { - snaKK.ylist_re_gpu(iatom, mapper.idxu_half, ielem) = 0.; - snaKK.ylist_im_gpu(iatom, mapper.idxu_half, ielem) = 0.; + snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.; + snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.; } } } @@ -1118,7 +1118,8 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.ulisttot_full(idxu, ielem, iatom) = utot; // Zero Yi - snaKK.ylist(idxu_half, ielem, iatom) = {0., 0.}; + snaKK.ylist_re(iatom, ielem, idxu_half) = 0; + snaKK.ylist_im(iatom, ielem, idxu_half) = 0; // Symmetric term const int sign_factor = (((ma+mb)%2==0)?1:-1); diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index edc9d885da..185fbd295d 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -289,13 +289,17 @@ class SNAKokkos { // Beta for all atoms in list; aliases the object in PairSnapKokkos t_sna_2d d_beta; + // Structures for both the CPU, GPU backend + t_sna_3d ylist_re; + t_sna_3d ylist_im; + + // Structures for the CPU backend only t_sna_3d blist; t_sna_3c_ll ulisttot; t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only t_sna_3c_ll zlist; t_sna_3c_ll ulist; - t_sna_3c_ll ylist; // derivatives of data t_sna_4c3_ll dulist; @@ -312,8 +316,6 @@ class SNAKokkos { t_sna_3c ulisttot_gpu; // packed and de-symmetrized t_sna_3c zlist_gpu; t_sna_3d blist_gpu; - t_sna_3d ylist_re_gpu; // split real, - t_sna_3d ylist_im_gpu; // imag int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max; @@ -409,4 +411,3 @@ class SNAKokkos { #include "sna_kokkos_impl.h" #endif - diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index bb69f90b13..beb50c2083 100644 --- 
a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -313,8 +313,10 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(element,"sna:element",natom_pad,nmax); MemKK::realloc_kokkos(dedr,"sna:dedr",natom_pad,nmax,3); - if constexpr (!host_flag) { + MemKK::realloc_kokkos(ylist_re,"sna:ylist_re", natom_pad, nelements, idxu_half_max); + MemKK::realloc_kokkos(ylist_im,"sna:ylist_im", natom_pad, nelements, idxu_half_max); + if constexpr (!host_flag) { MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",natom_pad,nmax); MemKK::realloc_kokkos(b_gpu,"sna:b_gpu",natom_pad,nmax); MemKK::realloc_kokkos(da_gpu,"sna:da_gpu",natom_pad,nmax,3); @@ -330,9 +332,6 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",natom_pad,idxz_max,ndoubles); MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",natom_pad,idxb_max,ntriples); - MemKK::realloc_kokkos(ylist,"sna:ylist",1,1,1); - MemKK::realloc_kokkos(ylist_re_gpu,"sna:ylist_re_gpu",natom_pad,idxu_half_max,nelements); - MemKK::realloc_kokkos(ylist_im_gpu,"sna:ylist_im_gpu",natom_pad,idxu_half_max,nelements); MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); } else { MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",1,1); @@ -350,9 +349,6 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",1,1,1); MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",1,1,1); - MemKK::realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom_pad); - MemKK::realloc_kokkos(ylist_re_gpu,"sna:ylist_pack_re",1,1,1); - MemKK::realloc_kokkos(ylist_im_gpu,"sna:ylist_pack_im",1,1,1); MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom_pad,nmax); } @@ -806,8 +802,8 @@ void SNAKokkos::compute_yi(const int& iato const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); - 
Kokkos::atomic_add(&(ylist_re_gpu(iatom, jju_half, elem3)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_im_gpu(iatom, jju_half, elem3)), betaj * ztmp.im); + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); } // end loop over elem3 } // end loop over elem2 } // end loop over elem1 @@ -839,8 +835,8 @@ void SNAKokkos::compute_yi_with_zlist(cons const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); - Kokkos::atomic_add(&(ylist_re_gpu(iatom, jju_half, elem3)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_im_gpu(iatom, jju_half, elem3)), betaj * ztmp.im); + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); } // end loop over elem3 idouble++; } // end loop over elem2 @@ -1096,7 +1092,7 @@ typename SNAKokkos::real_type SNAKokkos::real_type SNAKokkos(0.5)*y_local; } else if (ma > (mb-1)) { y_local.re = static_cast(0.); y_local.im = static_cast(0.); } // can probably avoid this outright @@ -1459,30 +1455,10 @@ void SNAKokkos::compute_yi_cpu(int iter) c // pick out right beta value for (int elem3 = 0; elem3 < nelements; elem3++) { + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); - if (j >= j1) { - const int jjb = idxb_block(j1, j2, j); - const int itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; - if (j1 == j) { - if (j2 == j) betaj = 3 * d_beta(iatom, itriple); - else betaj = 2 * d_beta(iatom, itriple); - } else betaj = d_beta(iatom, itriple); - } else if (j >= j2) { - const int jjb = idxb_block(j, j2, j1); - const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = 2 * d_beta(iatom, itriple); - else betaj = d_beta(iatom, itriple); - } else { - const int jjb = idxb_block(j2, j, j1); - const int itriple = ((elem2 * nelements 
+ elem3) * nelements + elem1) * idxb_max + jjb; - betaj = d_beta(iatom, itriple); - } - - if (!bnorm_flag && j1 > j) - betaj *= static_cast(j1 + 1) / static_cast(j + 1); - - Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).re), betaj*ztmp_r); - Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).im), betaj*ztmp_i); + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj*ztmp_i); } // end loop over elem3 } // end loop over elem2 } // end loop over elem1 @@ -1541,12 +1517,10 @@ void SNAKokkos::compute_deidrj_cpu(const t for (int mb = 0; 2*mb < j; mb++) for (int ma = 0; ma <= j; ma++) { - sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im; + const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * y_val.re + dulist(jju_cache,iatom,jnbor,0).im * y_val.im; + sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * y_val.re + dulist(jju_cache,iatom,jnbor,1).im * y_val.im; + sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * y_val.re + dulist(jju_cache,iatom,jnbor,2).im * y_val.im; jju_half++; jju_cache++; } //end loop over ma mb @@ -1556,22 +1530,19 @@ void SNAKokkos::compute_deidrj_cpu(const t int mb = j/2; for (int ma = 0; ma < mb; ma++) { - sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + - 
dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im; + const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * y_val.re + dulist(jju_cache,iatom,jnbor,0).im * y_val.im; + sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * y_val.re + dulist(jju_cache,iatom,jnbor,1).im * y_val.im; + sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * y_val.re + dulist(jju_cache,iatom,jnbor,2).im * y_val.im; jju_half++; jju_cache++; } //int ma = mb; - sum_tmp.x += (dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im)*0.5; - sum_tmp.y += (dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im)*0.5; - sum_tmp.z += (dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im)*0.5; + // 0.5 is meant to avoid double-counting + const complex y_val = { 0.5 * ylist_re(iatom, jelem, jju_half), 0.5 * ylist_im(iatom, jelem, jju_half) }; + sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * y_val.re + dulist(jju_cache,iatom,jnbor,0).im * y_val.im; + sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * y_val.re + dulist(jju_cache,iatom,jnbor,1).im * y_val.im; + sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * y_val.re + dulist(jju_cache,iatom,jnbor,2).im * y_val.im; } // end if jeven },final_sum); // end loop over j @@ -2328,8 +2299,10 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(rootpqarray); bytes += MemKK::memory_usage(cglist); - if constexpr (!host_flag) { + bytes += MemKK::memory_usage(ylist_re); + bytes += MemKK::memory_usage(ylist_im); + if constexpr (!host_flag) { bytes 
+= MemKK::memory_usage(a_gpu); bytes += MemKK::memory_usage(b_gpu); bytes += MemKK::memory_usage(da_gpu); @@ -2343,11 +2316,7 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(zlist_gpu); bytes += MemKK::memory_usage(blist_gpu); - - bytes += MemKK::memory_usage(ylist_re_gpu); - bytes += MemKK::memory_usage(ylist_im_gpu); } else { - bytes += MemKK::memory_usage(ulist); bytes += MemKK::memory_usage(ulisttot); bytes += MemKK::memory_usage(ulisttot_full); @@ -2355,8 +2324,6 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(zlist); bytes += MemKK::memory_usage(blist); - bytes += MemKK::memory_usage(ylist); - bytes += MemKK::memory_usage(dulist); } From cf6714ea33815a6feb0166364fb524d43c5a7813 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 11:12:20 -0800 Subject: [PATCH 10/30] Unified ulisttot CPU and GPU structures --- src/KOKKOS/pair_snap_kokkos_impl.h | 16 +++--- src/KOKKOS/sna_kokkos.h | 9 ++-- src/KOKKOS/sna_kokkos_impl.h | 85 +++++++++++++----------------- 3 files changed, 49 insertions(+), 61 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 23e3f1d3a6..2ca74375c4 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -250,7 +250,7 @@ void PairSNAPKokkos::compute(int eflag_in, } { - // Expand ulisttot -> ulisttot_full + // Expand ulisttot_re,_im -> ulisttot // Zero out ylist typename Kokkos::MDRangePolicy, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size}); Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this); @@ -823,7 +823,7 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, 
const int iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; @@ -835,8 +835,8 @@ void PairSNAPKokkos::operator() (TagPairSN const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - auto utot_re = snaKK.ulisttot_re_gpu(iatom, mapper.idxu_half, ielem); - auto utot_im = snaKK.ulisttot_im_gpu(iatom, mapper.idxu_half, ielem); + auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half); + auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half); if (mapper.flip_sign == 1) { utot_im = -utot_im; @@ -844,7 +844,7 @@ void PairSNAPKokkos::operator() (TagPairSN utot_re = -utot_re; } - snaKK.ulisttot_gpu(iatom, idxu, ielem) = { utot_re, utot_im }; + snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im }; if (mapper.flip_sign == 0) { snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.; @@ -1112,10 +1112,10 @@ void PairSNAPKokkos::operator() (TagPairSN const int idxu = jju + idxu_shift; // Load ulist - auto utot = snaKK.ulisttot(idxu_half, ielem, iatom); + complex utot = { snaKK.ulisttot_re(iatom, ielem, idxu_half), snaKK.ulisttot_im(iatom, ielem, idxu_half) }; // Store - snaKK.ulisttot_full(idxu, ielem, iatom) = utot; + snaKK.ulisttot(iatom, ielem, idxu) = utot; // Zero Yi snaKK.ylist_re(iatom, ielem, idxu_half) = 0; @@ -1131,7 +1131,7 @@ void PairSNAPKokkos::operator() (TagPairSN utot.re = -utot.re; } - snaKK.ulisttot_full(idxu_flip, ielem, iatom) = utot; + snaKK.ulisttot(iatom, ielem, idxu_flip) = utot; } } } diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 185fbd295d..66d7e7254f 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -290,13 +290,15 @@ class SNAKokkos { t_sna_2d d_beta; // Structures for both the CPU, GPU backend + t_sna_3d ulisttot_re; + t_sna_3d ulisttot_im; + t_sna_3c ulisttot; // un-folded ulisttot + t_sna_3d ylist_re; t_sna_3d ylist_im; // Structures for the CPU backend only t_sna_3d blist; - t_sna_3c_ll ulisttot; - t_sna_3c_ll ulisttot_full; 
// un-folded ulisttot, cpu only t_sna_3c_ll zlist; t_sna_3c_ll ulist; @@ -311,9 +313,6 @@ class SNAKokkos { t_sna_3c db_gpu; // `db` t_sna_3d sfac_gpu; // sfac, dsfac_{x,y,z} - t_sna_3d ulisttot_re_gpu; // split real, - t_sna_3d ulisttot_im_gpu; // imag, AoSoA, flattened - t_sna_3c ulisttot_gpu; // packed and de-symmetrized t_sna_3c zlist_gpu; t_sna_3d blist_gpu; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index beb50c2083..1623bec0e1 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -313,6 +313,10 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(element,"sna:element",natom_pad,nmax); MemKK::realloc_kokkos(dedr,"sna:dedr",natom_pad,nmax,3); + MemKK::realloc_kokkos(ulisttot_re,"sna:ulisttot_re", natom_pad, nelements, idxu_half_max); + MemKK::realloc_kokkos(ulisttot_im,"sna:ulisttot_im", natom_pad, nelements, idxu_half_max); + MemKK::realloc_kokkos(ulisttot,"sna:ulisttot", natom_pad, nelements, idxu_max); + MemKK::realloc_kokkos(ylist_re,"sna:ylist_re", natom_pad, nelements, idxu_half_max); MemKK::realloc_kokkos(ylist_im,"sna:ylist_im", natom_pad, nelements, idxu_half_max); @@ -322,11 +326,6 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(da_gpu,"sna:da_gpu",natom_pad,nmax,3); MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",natom_pad,nmax,3); MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",natom_pad,nmax,4); - MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation - MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1); - MemKK::realloc_kokkos(ulisttot_re_gpu,"sna:ulisttot_re_gpu",natom_pad,idxu_half_max,nelements); - MemKK::realloc_kokkos(ulisttot_im_gpu,"sna:ulisttot_im_gpu",natom_pad,idxu_half_max,nelements); - MemKK::realloc_kokkos(ulisttot_gpu,"sna:ulisttot_gpu",natom_pad,idxu_max,nelements); MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1); 
MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",natom_pad,idxz_max,ndoubles); @@ -339,18 +338,12 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(da_gpu,"sna:da_gpu",1,1,1); MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",1,1,1); MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",1,1,1); - MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom_pad); - MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom_pad); - MemKK::realloc_kokkos(ulisttot_re_gpu,"sna:ulisttot_re_gpu",1,1,1); - MemKK::realloc_kokkos(ulisttot_im_gpu,"sna:ulisttot_im_gpu",1,1,1); - MemKK::realloc_kokkos(ulisttot_gpu,"sna:ulisttot_pack_gpu",1,1,1); MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom_pad,nmax); MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom_pad); MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",1,1,1); MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",1,1,1); MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom_pad,nmax); - } } @@ -469,8 +462,8 @@ void SNAKokkos::pre_ui(const int& iatom, c real_type re_part = static_cast(0.); if (ma == mb && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; } - ulisttot_re_gpu(iatom, jju_half, jelem) = re_part; - ulisttot_im_gpu(iatom, jju_half, jelem) = static_cast(0.); + ulisttot_re(iatom, jelem, jju_half) = re_part; + ulisttot_im(iatom, jelem, jju_half) = 0; jju_half++; } @@ -614,8 +607,8 @@ void SNAKokkos::evaluate_ui_jbend(const Wi const complex ulist_prev = ulist_wrapper.get(ma); // atomic add the previous level here - Kokkos::atomic_add(&(ulisttot_re_gpu(iatom, jjup + ma, jelem)), ulist_prev.re * sfac); - Kokkos::atomic_add(&(ulisttot_im_gpu(iatom, jjup + ma, jelem)), ulist_prev.im * sfac); + Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jjup + ma)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jjup + ma)), ulist_prev.im * sfac); // 
ulist_accum += rootpq * b * ulist_prev; real_type rootpq = rootpqarray(j - ma, mb); @@ -644,8 +637,8 @@ void SNAKokkos::evaluate_ui_jbend(const Wi const complex ulist_prev = ulist_wrapper.get(ma); // atomic add the previous level here - Kokkos::atomic_add(&(ulisttot_re_gpu(iatom, jjup + ma, jelem)), ulist_prev.re * sfac); - Kokkos::atomic_add(&(ulisttot_im_gpu(iatom, jjup + ma, jelem)), ulist_prev.im * sfac); + Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jjup + ma)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jjup + ma)), ulist_prev.im * sfac); } } @@ -717,7 +710,7 @@ void SNAKokkos::compute_bi(const int& iato const int jju_index = jju+mb*(j+1)+ma; const int jjz_index = jjz+mb*(j+1)+ma; if (2*mb == j) return; // I think we can remove this? - const complex utot = ulisttot_gpu(iatom, jju_index, elem3); + const complex utot = ulisttot(iatom, elem3, jju_index); const complex zloc = zlist_gpu(iatom, jjz_index, idouble); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } @@ -733,7 +726,7 @@ void SNAKokkos::compute_bi(const int& iato const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - const complex utot = ulisttot_gpu(iatom, jju_index, elem3); + const complex utot = ulisttot(iatom, elem3, jju_index); const complex zloc = zlist_gpu(iatom, jjz_index, idouble); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; @@ -744,7 +737,7 @@ void SNAKokkos::compute_bi(const int& iato const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - const complex utot = ulisttot_gpu(iatom, jju_index, elem3); + const complex utot = ulisttot(iatom, elem3, jju_index); const complex zloc = zlist_gpu(iatom, jjz_index, idouble); sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); } // end if jeven @@ -869,8 +862,8 @@ typename SNAKokkos::complex SNAKokkos::pre_ui_cpu(const typename // if m is on the "diagonal", initialize it with the self energy. 
// Otherwise zero it out - complex init(static_cast(0.),static_cast(0.)); - if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init.re = wself; } //need to map iatom to element + real_type init = 0; + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = wself; } //need to map iatom to element - ulisttot(jjup, jelem, iatom) = init; + ulisttot_re(iatom, jelem, jjup) = init; + ulisttot_im(iatom, jelem, jjup) = 0; }); } } @@ -1261,10 +1255,8 @@ void SNAKokkos::compute_zi_cpu(const int& int ma2 = ma2max; int icga = ma1min * (j2 + 1) + ma2max; for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re - - ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im + - ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re); + suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re - ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im); + suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im + ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).re); ma1++; ma2--; icga += j2; @@ -1331,8 +1323,8 @@ void SNAKokkos::compute_bi_cpu(const typen const int jjz_index = jjz + mb * (j + 1) + ma; if (2*mb == j) return; sum += - ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im; + ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re + + ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im; },sumzu_temp); // end loop over ma, mb sumzu += sumzu_temp; @@ -1345,8 +1337,8 @@ void 
SNAKokkos::compute_bi_cpu(const typen const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; sum += - ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im; + ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re + + ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im; },sumzu_temp); // end loop over ma sumzu += sumzu_temp; @@ -1354,8 +1346,8 @@ void SNAKokkos::compute_bi_cpu(const typen const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; sumzu += static_cast(0.5)* - (ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im); + (ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re + + ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im); } // end if jeven Kokkos::single(Kokkos::PerThread(team), [&] () { @@ -1426,10 +1418,10 @@ void SNAKokkos::compute_yi_cpu(int iter) c int icga = ma1min*(j2+1) + ma2max; for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re - - ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im + - ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re); + suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re - + ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im); + suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im + + ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, 
jju2+ma2).re); ma1++; ma2--; icga += j2; @@ -1581,8 +1573,8 @@ void SNAKokkos::add_uarraytot(const typena int count = 0; for (int mb = 0; 2*mb <= j; mb++) { for (int ma = 0; ma <= j; ma++) { - Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).re), sfac * ulist(jju_cache+count, iatom, jnbor).re); - Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).im), sfac * ulist(jju_cache+count, iatom, jnbor).im); + Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jju_half+count)), sfac * ulist(jju_cache+count, iatom, jnbor).re); + Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jju_half+count)), sfac * ulist(jju_cache+count, iatom, jnbor).im); count++; } } @@ -2299,6 +2291,10 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(rootpqarray); bytes += MemKK::memory_usage(cglist); + bytes += MemKK::memory_usage(ulisttot_re); + bytes += MemKK::memory_usage(ulisttot_im); + bytes += MemKK::memory_usage(ulisttot); + bytes += MemKK::memory_usage(ylist_re); bytes += MemKK::memory_usage(ylist_im); @@ -2309,17 +2305,10 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(db_gpu); bytes += MemKK::memory_usage(sfac_gpu); - - bytes += MemKK::memory_usage(ulisttot_re_gpu); - bytes += MemKK::memory_usage(ulisttot_im_gpu); - bytes += MemKK::memory_usage(ulisttot_gpu); - bytes += MemKK::memory_usage(zlist_gpu); bytes += MemKK::memory_usage(blist_gpu); } else { bytes += MemKK::memory_usage(ulist); - bytes += MemKK::memory_usage(ulisttot); - bytes += MemKK::memory_usage(ulisttot_full); bytes += MemKK::memory_usage(zlist); bytes += MemKK::memory_usage(blist); From cb548143eea8c1369975c0f98ac10cf82947e926 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 12:14:32 -0800 Subject: [PATCH 11/30] Unified zlist and blist CPU and GPU structures; greatly simplified/fused compute_bi --- src/KOKKOS/pair_snap_kokkos.h | 7 +- src/KOKKOS/pair_snap_kokkos_impl.h | 38 ++----- src/KOKKOS/sna_kokkos.h | 10 +- src/KOKKOS/sna_kokkos_impl.h | 167 
+++-------------------------- 4 files changed, 23 insertions(+), 199 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index c9607daa1c..7d58f387d6 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -40,7 +40,6 @@ struct TagPairSNAPBeta{}; template struct TagPairSNAPComputeForce{}; - // GPU backend only struct TagPairSNAPComputeNeigh{}; struct TagPairSNAPComputeCayleyKlein{}; @@ -50,7 +49,6 @@ struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYiWithZlist{}; template @@ -187,9 +185,6 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; - KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const; @@ -221,7 +216,7 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeZiCPU,const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeBiCPU, const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYiCPU,const int& ii) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 2ca74375c4..b2252f85c7 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -264,9 +264,8 @@ void PairSNAPKokkos::compute(int eflag_in, 
Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); //ComputeBi - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_bi_cpu(chunk_size,team_size,vector_length); + int idxb_max = snaKK.idxb_max; + typename Kokkos::RangePolicy policy_bi_cpu(0, chunk_size * idxb_max); Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); } @@ -381,13 +380,6 @@ void PairSNAPKokkos::compute(int eflag_in, Snap3DRangePolicy policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); - - //Transform data layout of blist out of AoSoA - //We need this because `blist` gets used in ComputeForce which doesn't - //take advantage of AoSoA, which at best would only be beneficial on the margins - Snap3DRangePolicy - policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); - Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); } //Note zeroing `ylist` is fused into `TransformUi`. 
@@ -900,25 +892,6 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.compute_bi(iatom,jjb); } -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - if (idxb >= snaKK.idxb_max) return; - - const int ntriples = snaKK.ntriples; - - for (int itriple = 0; itriple < ntriples; itriple++) { - - const real_type blocal = snaKK.blist_gpu(iatom, idxb, itriple); - - snaKK.blist(iatom, itriple, idxb) = blocal; - } - -} - template template KOKKOS_INLINE_FUNCTION @@ -1151,9 +1124,10 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); - snaKK.compute_bi_cpu(team,ii); +void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU, const int& ii) const { + const int iatom = ii / snaKK.idxb_max; + const int jjb = ii % snaKK.idxb_max; + snaKK.compute_bi(iatom, jjb); } template diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 66d7e7254f..163c738651 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -244,8 +244,6 @@ class SNAKokkos { void compute_zi_cpu(const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi_cpu(int) const; // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int) const; // ForceSNAP // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION @@ -294,12 +292,13 @@ class SNAKokkos { t_sna_3d ulisttot_im; t_sna_3c ulisttot; // un-folded ulisttot + t_sna_3c zlist; + t_sna_3d blist; + t_sna_3d ylist_re; t_sna_3d ylist_im; // Structures for the CPU backend only - t_sna_3d blist; - t_sna_3c_ll zlist; t_sna_3c_ll ulist; @@ -313,9 +312,6 @@ class SNAKokkos { t_sna_3c db_gpu; // `db` t_sna_3d 
sfac_gpu; // sfac, dsfac_{x,y,z} - t_sna_3c zlist_gpu; - t_sna_3d blist_gpu; - int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max; // Chem snap counts diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 1623bec0e1..0e2753215b 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -317,6 +317,9 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(ulisttot_im,"sna:ulisttot_im", natom_pad, nelements, idxu_half_max); MemKK::realloc_kokkos(ulisttot,"sna:ulisttot", natom_pad, nelements, idxu_max); + MemKK::realloc_kokkos(zlist,"sna:zlist", natom_pad, ndoubles, idxz_max); + MemKK::realloc_kokkos(blist,"sna:blist", natom_pad, ntriples, idxb_max); + MemKK::realloc_kokkos(ylist_re,"sna:ylist_re", natom_pad, nelements, idxu_half_max); MemKK::realloc_kokkos(ylist_im,"sna:ylist_im", natom_pad, nelements, idxu_half_max); @@ -327,10 +330,6 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",natom_pad,nmax,3); MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",natom_pad,nmax,4); MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); - MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1); - MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",natom_pad,idxz_max,ndoubles); - MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); - MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",natom_pad,idxb_max,ntriples); MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); } else { MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",1,1); @@ -339,10 +338,6 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",1,1,1); MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",1,1,1); MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom_pad,nmax); - MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom_pad); - MemKK::realloc_kokkos(zlist_gpu,"sna:zlist_gpu",1,1,1); - MemKK::realloc_kokkos(blist,"sna:blist",natom_pad,ntriples,idxb_max); - 
MemKK::realloc_kokkos(blist_gpu,"sna:blist_gpu",1,1,1); MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom_pad,nmax); } } @@ -663,7 +658,7 @@ void SNAKokkos::compute_zi(const int& iato for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - zlist_gpu(iatom,jjz,idouble) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); + zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); idouble++; } @@ -711,7 +706,7 @@ void SNAKokkos::compute_bi(const int& iato const int jjz_index = jjz+mb*(j+1)+ma; if (2*mb == j) return; // I think we can remove this? const complex utot = ulisttot(iatom, elem3, jju_index); - const complex zloc = zlist_gpu(iatom, jjz_index, idouble); + const complex zloc = zlist(iatom, idouble, jjz_index); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } } @@ -727,7 +722,7 @@ void SNAKokkos::compute_bi(const int& iato const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; const complex utot = ulisttot(iatom, elem3, jju_index); - const complex zloc = zlist_gpu(iatom, jjz_index, idouble); + const complex zloc = zlist(iatom, idouble, jjz_index); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } @@ -738,7 +733,7 @@ void SNAKokkos::compute_bi(const int& iato const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; const complex utot = ulisttot(iatom, elem3, jju_index); - const complex zloc = zlist_gpu(iatom, jjz_index, idouble); + const complex zloc = zlist(iatom, idouble, jjz_index); sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); } // end if jeven @@ -752,7 +747,7 @@ void SNAKokkos::compute_bi(const int& iato sumzu -= bzero[j]; } } - blist_gpu(iatom, jjb, itriple) = sumzu; + blist(iatom, itriple, jjb) = sumzu; //} // end loop over j //} // end loop over j1, j2 itriple++; @@ -818,7 +813,7 @@ void SNAKokkos::compute_yi_with_zlist(cons int idouble = 0; for (int elem1 = 0; elem1 < 
nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - const complex ztmp = zlist_gpu(iatom,jjz,idouble); + const complex ztmp = zlist(iatom, idouble, jjz); // apply to z(j1,j2,j,ma,mb) to unique element of y(j) // find right y_list[jju] and beta(iatom,jjb) entries // multiply and divide by j+1 factors @@ -1240,144 +1235,12 @@ void SNAKokkos::compute_zi_cpu(const int& for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - zlist(jjz, idouble, iatom).re = static_cast(0.0); - zlist(jjz, idouble, iatom).im = static_cast(0.0); - - int jju1 = idxu_block[j1] + (j1+1)*mb1min; - int jju2 = idxu_block[j2] + (j2+1)*mb2max; - int icgb = mb1min*(j2+1) + mb2max; - for (int ib = 0; ib < nb; ib++) { - - real_type suma1_r = static_cast(0.0); - real_type suma1_i = static_cast(0.0); - - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min * (j2 + 1) + ma2max; - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re - ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im); - suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im + ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia - - zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r; - zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i; - - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib - - if (bnorm_flag) { - const real_type scale = static_cast(1) / static_cast(j + 1); - zlist(jjz, idouble, iatom).re *= scale; - zlist(jjz, idouble, iatom).im *= scale; - } + zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); idouble++; } // end loop over elem2 } // end loop over elem1 } - -/* 
---------------------------------------------------------------------- - compute Bi by summing conj(Ui)*Zi, CPU version -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) const -{ - // for j1 = 0,...,twojmax - // for j2 = 0,twojmax - // for j = |j1-j2|,Min(twojmax,j1+j2),2 - // b(j1,j2,j) = 0 - // for mb = 0,...,jmid - // for ma = 0,...,j - // b(j1,j2,j) += - // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) - - int itriple = 0; - int idouble = 0; - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - int jalloy = idouble; // must be non-const to work around gcc compiler bug - for (int elem3 = 0; elem3 < nelements; elem3++) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max), - [&] (const int& jjb) { - const int j1 = idxb(jjb, 0); - const int j2 = idxb(jjb, 1); - int j = idxb(jjb, 2); // removed "const" to work around GCC 7 bug - - int jjz = idxz_block(j1, j2, j); - int jju = idxu_block[j]; - real_type sumzu = static_cast(0.0); - real_type sumzu_temp = static_cast(0.0); - const int bound = (j+2)/2; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound), - [&] (const int mbma, real_type& sum) { - const int ma = mbma % (j + 1); - const int mb = mbma / (j + 1); - const int jju_index = jju + mb * (j + 1) + ma; - const int jjz_index = jjz + mb * (j + 1) + ma; - if (2*mb == j) return; - sum += - ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im; - },sumzu_temp); // end loop over ma, mb - sumzu += sumzu_temp; - - // For j even, special treatment for middle column - - if (j%2 == 0) { - int mb = j/2; // removed "const" to work around GCC 7 bug - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb), - [&] (const int ma, real_type& sum) { - const 
int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - sum += - ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im; - },sumzu_temp); // end loop over ma - sumzu += sumzu_temp; - - const int ma = mb; - const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - sumzu += static_cast(0.5)* - (ulisttot(iatom, elem3, jju_index).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot(iatom, elem3, jju_index).im * zlist(jjz_index, jalloy, iatom).im); - } // end if jeven - - Kokkos::single(Kokkos::PerThread(team), [&] () { - sumzu *= static_cast(2.0); - - // apply bzero shift - - if (bzero_flag) { - if (!wselfall_flag) { - if (elem1 == elem2 && elem1 == elem3) { - sumzu -= bzero[j]; - } - } else { - sumzu -= bzero[j]; - } - } - - blist(iatom, itriple, jjb) = sumzu; - }); - }); - //} // end loop over j - //} // end loop over j1, j2 - itriple++; - } - idouble++; - } // end loop over elem2 - } // end loop over elem1 - -} - /* ---------------------------------------------------------------------- compute Yi from Ui without storing Zi, looping over zlist indices, CPU version @@ -2295,6 +2158,9 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(ulisttot_im); bytes += MemKK::memory_usage(ulisttot); + bytes += MemKK::memory_usage(zlist); + bytes += MemKK::memory_usage(blist); + bytes += MemKK::memory_usage(ylist_re); bytes += MemKK::memory_usage(ylist_im); @@ -2304,15 +2170,8 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(da_gpu); bytes += MemKK::memory_usage(db_gpu); bytes += MemKK::memory_usage(sfac_gpu); - - bytes += MemKK::memory_usage(zlist_gpu); - bytes += MemKK::memory_usage(blist_gpu); } else { bytes += MemKK::memory_usage(ulist); - - bytes += MemKK::memory_usage(zlist); - bytes += MemKK::memory_usage(blist); - bytes += MemKK::memory_usage(dulist); } From 
976167e2e52c4b0ea246b0f5fe29c48468473f41 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 12:30:58 -0800 Subject: [PATCH 12/30] Greatly tightened up CPU ulist, dulist computation --- src/KOKKOS/sna_kokkos.h | 9 +- src/KOKKOS/sna_kokkos_impl.h | 175 ++++++++++++++++------------------- 2 files changed, 82 insertions(+), 102 deletions(-) diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 163c738651..fa187c26ea 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -158,7 +158,7 @@ class SNAKokkos { typedef Kokkos::View t_sna_3c; typedef Kokkos::View t_sna_3c_ll; typedef Kokkos::View t_sna_4c; - typedef Kokkos::View t_sna_4c3_ll; + typedef Kokkos::View t_sna_4c3; typedef Kokkos::View t_sna_4c_ll; typedef Kokkos::View t_sna_3c3; typedef Kokkos::View t_sna_5c; @@ -299,11 +299,8 @@ class SNAKokkos { t_sna_3d ylist_im; // Structures for the CPU backend only - - t_sna_3c_ll ulist; - - // derivatives of data - t_sna_4c3_ll dulist; + t_sna_3c ulist_cpu; + t_sna_4c3 dulist_cpu; // Modified structures for GPU backend t_sna_2c a_gpu; // Cayley-Klein `a` diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 0e2753215b..7988026dd2 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -329,16 +329,16 @@ void SNAKokkos::grow_rij(int newnatom, int MemKK::realloc_kokkos(da_gpu,"sna:da_gpu",natom_pad,nmax,3); MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",natom_pad,nmax,3); MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",natom_pad,nmax,4); - MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); - MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); + MemKK::realloc_kokkos(ulist_cpu,"sna:ulist_cpu",1,1,1); + MemKK::realloc_kokkos(dulist_cpu,"sna:dulist_cpu",1,1,1); } else { MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",1,1); MemKK::realloc_kokkos(b_gpu,"sna:b_gpu",1,1); MemKK::realloc_kokkos(da_gpu,"sna:da_gpu",1,1,1); MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",1,1,1); 
MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",1,1,1); - MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom_pad,nmax); - MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom_pad,nmax); + MemKK::realloc_kokkos(ulist_cpu,"sna:ulist_cpu", natom_pad, nmax, idxu_cache_max); + MemKK::realloc_kokkos(dulist_cpu,"sna:dulist_cpu", natom_pad, nmax, idxu_cache_max); } } @@ -1373,9 +1373,9 @@ void SNAKokkos::compute_deidrj_cpu(const t for (int mb = 0; 2*mb < j; mb++) for (int ma = 0; ma <= j; ma++) { const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; - sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * y_val.re + dulist(jju_cache,iatom,jnbor,0).im * y_val.im; - sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * y_val.re + dulist(jju_cache,iatom,jnbor,1).im * y_val.im; - sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * y_val.re + dulist(jju_cache,iatom,jnbor,2).im * y_val.im; + sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im; + sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im; + sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im; jju_half++; jju_cache++; } //end loop over ma mb @@ -1386,18 +1386,18 @@ void SNAKokkos::compute_deidrj_cpu(const t int mb = j/2; for (int ma = 0; ma < mb; ma++) { const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; - sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * y_val.re + dulist(jju_cache,iatom,jnbor,0).im * y_val.im; - sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * y_val.re + dulist(jju_cache,iatom,jnbor,1).im * y_val.im; - sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * y_val.re + dulist(jju_cache,iatom,jnbor,2).im * y_val.im; + sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, 
jju_cache, 0).im * y_val.im; + sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im; + sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im; jju_half++; jju_cache++; } //int ma = mb; // 0.5 is meant to avoid double-counting const complex y_val = { 0.5 * ylist_re(iatom, jelem, jju_half), 0.5 * ylist_im(iatom, jelem, jju_half) }; - sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * y_val.re + dulist(jju_cache,iatom,jnbor,0).im * y_val.im; - sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * y_val.re + dulist(jju_cache,iatom,jnbor,1).im * y_val.im; - sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * y_val.re + dulist(jju_cache,iatom,jnbor,2).im * y_val.im; + sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im; + sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im; + sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im; } // end if jeven },final_sum); // end loop over j @@ -1436,8 +1436,8 @@ void SNAKokkos::add_uarraytot(const typena int count = 0; for (int mb = 0; 2*mb <= j; mb++) { for (int ma = 0; ma <= j; ma++) { - Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jju_half+count)), sfac * ulist(jju_cache+count, iatom, jnbor).re); - Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jju_half+count)), sfac * ulist(jju_cache+count, iatom, jnbor).im); + Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).re); + Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).im); count++; } } @@ -1469,8 +1469,7 @@ void SNAKokkos::compute_uarray_cpu(const t // VMK Section 4.8.2 - ulist(0,iatom,jnbor).re = 1.0; - ulist(0,iatom,jnbor).im 
= 0.0; + ulist_cpu(iatom, jnbor, 0) = complex::one(); for (int j = 1; j <= twojmax; j++) { int jju = idxu_cache_block[j]; // removed "const" to work around GCC 7 bug @@ -1482,31 +1481,30 @@ void SNAKokkos::compute_uarray_cpu(const t [&] (const int& mb) { //for (int mb = 0; 2*mb <= j; mb++) { const int jju_index = jju+mb+mb*j; - ulist(jju_index,iatom,jnbor).re = 0.0; - ulist(jju_index,iatom,jnbor).im = 0.0; + ulist_cpu(iatom, jnbor, jju_index) = { static_cast(0), static_cast(0) }; for (int ma = 0; ma < j; ma++) { const int jju_index = jju+mb+mb*j+ma; const int jjup_index = jjup+mb*j+ma; rootpq = rootpqarray(j - ma,j - mb); - ulist(jju_index,iatom,jnbor).re += + ulist_cpu(iatom, jnbor, jju_index).re += rootpq * - (a_r * ulist(jjup_index,iatom,jnbor).re + - a_i * ulist(jjup_index,iatom,jnbor).im); - ulist(jju_index,iatom,jnbor).im += + (a_r * ulist_cpu(iatom, jnbor, jjup_index).re + + a_i * ulist_cpu(iatom, jnbor, jjup_index).im); + ulist_cpu(iatom, jnbor, jju_index).im += rootpq * - (a_r * ulist(jjup_index,iatom,jnbor).im - - a_i * ulist(jjup_index,iatom,jnbor).re); + (a_r * ulist_cpu(iatom, jnbor, jjup_index).im - + a_i * ulist_cpu(iatom, jnbor, jjup_index).re); rootpq = rootpqarray(ma + 1,j - mb); - ulist(jju_index+1,iatom,jnbor).re = + ulist_cpu(iatom, jnbor, jju_index+1).re = -rootpq * - (b_r * ulist(jjup_index,iatom,jnbor).re + - b_i * ulist(jjup_index,iatom,jnbor).im); - ulist(jju_index+1,iatom,jnbor).im = + (b_r * ulist_cpu(iatom, jnbor, jjup_index).re + + b_i * ulist_cpu(iatom, jnbor, jjup_index).im); + ulist_cpu(iatom, jnbor, jju_index+1).im = -rootpq * - (b_r * ulist(jjup_index,iatom,jnbor).im - - b_i * ulist(jjup_index,iatom,jnbor).re); + (b_r * ulist_cpu(iatom, jnbor, jjup_index).im - + b_i * ulist_cpu(iatom, jnbor, jjup_index).re); } // copy left side to right side with inversion symmetry VMK 4.4(2) @@ -1521,11 +1519,11 @@ void SNAKokkos::compute_uarray_cpu(const t const int jju_index = jju + mb*(j+1) + ma; const int jjup_index = jju + 
(j+1-mb)*(j+1)-(ma+1); if (mapar == 1) { - ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re; - ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im; + ulist_cpu(iatom, jnbor, jjup_index).re = ulist_cpu(iatom, jnbor, jju_index).re; + ulist_cpu(iatom, jnbor, jjup_index).im = -ulist_cpu(iatom, jnbor, jju_index).im; } else { - ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re; - ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im; + ulist_cpu(iatom, jnbor, jjup_index).re = -ulist_cpu(iatom, jnbor, jju_index).re; + ulist_cpu(iatom, jnbor, jjup_index).im = ulist_cpu(iatom, jnbor, jju_index).im; } mapar = -mapar; } @@ -1551,14 +1549,14 @@ void SNAKokkos::compute_duarray_cpu(const { real_type r0inv; real_type a_r, a_i, b_r, b_i; - real_type da_r[3], da_i[3], db_r[3], db_i[3]; + real_type u[3], da_r[3], da_i[3], db_r[3], db_i[3]; real_type dz0[3], dr0inv[3], dr0invdr; real_type rootpq; real_type rinv = 1.0 / r; - real_type ux = x * rinv; - real_type uy = y * rinv; - real_type uz = z * rinv; + u[0] = x * rinv; + u[1] = y * rinv; + u[2] = z * rinv; r0inv = 1.0 / sqrt(r * r + z0 * z0); a_r = z0 * r0inv; @@ -1568,13 +1566,13 @@ void SNAKokkos::compute_duarray_cpu(const dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); - dr0inv[0] = dr0invdr * ux; - dr0inv[1] = dr0invdr * uy; - dr0inv[2] = dr0invdr * uz; + dr0inv[0] = dr0invdr * u[0]; + dr0inv[1] = dr0invdr * u[1]; + dr0inv[2] = dr0invdr * u[2]; - dz0[0] = dz0dr * ux; - dz0[1] = dz0dr * uy; - dz0[2] = dz0dr * uz; + dz0[0] = dz0dr * u[0]; + dz0[1] = dz0dr * u[1]; + dz0[2] = dz0dr * u[2]; for (int k = 0; k < 3; k++) { da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k]; @@ -1591,12 +1589,8 @@ void SNAKokkos::compute_duarray_cpu(const db_i[0] += -r0inv; db_r[1] += r0inv; - dulist(0,iatom,jnbor,0).re = 0.0; - dulist(0,iatom,jnbor,1).re = 0.0; - dulist(0,iatom,jnbor,2).re = 0.0; - dulist(0,iatom,jnbor,0).im = 0.0; - dulist(0,iatom,jnbor,1).im = 0.0; - 
dulist(0,iatom,jnbor,2).im = 0.0; + for (int k = 0; k < 3; k++) + dulist_cpu(iatom, jnbor, 0, k) = complex::zero(); for (int j = 1; j <= twojmax; j++) { int jju = idxu_cache_block[j]; @@ -1605,42 +1599,38 @@ void SNAKokkos::compute_duarray_cpu(const [&] (const int& mb) { //for (int mb = 0; 2*mb <= j; mb++) { const int jju_index = jju+mb+mb*j; - dulist(jju_index,iatom,jnbor,0).re = 0.0; - dulist(jju_index,iatom,jnbor,1).re = 0.0; - dulist(jju_index,iatom,jnbor,2).re = 0.0; - dulist(jju_index,iatom,jnbor,0).im = 0.0; - dulist(jju_index,iatom,jnbor,1).im = 0.0; - dulist(jju_index,iatom,jnbor,2).im = 0.0; + for (int k = 0; k < 3; k++) + dulist_cpu(iatom, jnbor, jju_index, k) = complex::zero(); for (int ma = 0; ma < j; ma++) { const int jju_index = jju+mb+mb*j+ma; const int jjup_index = jjup+mb*j+ma; rootpq = rootpqarray(j - ma,j - mb); for (int k = 0; k < 3; k++) { - dulist(jju_index,iatom,jnbor,k).re += - rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).re + - da_i[k] * ulist(jjup_index,iatom,jnbor).im + - a_r * dulist(jjup_index,iatom,jnbor,k).re + - a_i * dulist(jjup_index,iatom,jnbor,k).im); - dulist(jju_index,iatom,jnbor,k).im += - rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).im - - da_i[k] * ulist(jjup_index,iatom,jnbor).re + - a_r * dulist(jjup_index,iatom,jnbor,k).im - - a_i * dulist(jjup_index,iatom,jnbor,k).re); + dulist_cpu(iatom, jnbor, jju_index, k).re += + rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re + + da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im + + a_r * dulist_cpu(iatom, jnbor, jjup_index, k).re + + a_i * dulist_cpu(iatom, jnbor, jjup_index, k).im); + dulist_cpu(iatom, jnbor, jju_index, k).im += + rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im - + da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re + + a_r * dulist_cpu(iatom, jnbor, jjup_index, k).im - + a_i * dulist_cpu(iatom, jnbor, jjup_index, k).re); } rootpq = rootpqarray(ma + 1,j - mb); for (int k = 0; k < 3; k++) { - dulist(jju_index+1,iatom,jnbor,k).re = - 
-rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).re + - db_i[k] * ulist(jjup_index,iatom,jnbor).im + - b_r * dulist(jjup_index,iatom,jnbor,k).re + - b_i * dulist(jjup_index,iatom,jnbor,k).im); - dulist(jju_index+1,iatom,jnbor,k).im = - -rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).im - - db_i[k] * ulist(jjup_index,iatom,jnbor).re + - b_r * dulist(jjup_index,iatom,jnbor,k).im - - b_i * dulist(jjup_index,iatom,jnbor,k).re); + dulist_cpu(iatom, jnbor, jju_index+1, k).re = + -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re + + db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im + + b_r * dulist_cpu(iatom, jnbor, jjup_index, k).re + + b_i * dulist_cpu(iatom, jnbor, jjup_index, k).im); + dulist_cpu(iatom, jnbor, jju_index+1, k).im = + -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im - + db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re + + b_r * dulist_cpu(iatom, jnbor, jjup_index, k).im - + b_i * dulist_cpu(iatom, jnbor, jjup_index, k).re); } } @@ -1658,13 +1648,13 @@ void SNAKokkos::compute_duarray_cpu(const const int jjup_index = jju+(mb+2)*(j+1)-(ma+1); if (mapar == 1) { for (int k = 0; k < 3; k++) { - dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re; - dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im; + dulist_cpu(iatom, jnbor, jjup_index, k).re = dulist_cpu(iatom, jnbor, jju_index, k).re; + dulist_cpu(iatom, jnbor, jjup_index, k).im = -dulist_cpu(iatom, jnbor, jju_index, k).im; } } else { for (int k = 0; k < 3; k++) { - dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re; - dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im; + dulist_cpu(iatom, jnbor, jjup_index, k).re = -dulist_cpu(iatom, jnbor, jju_index, k).re; + dulist_cpu(iatom, jnbor, jjup_index, k).im = dulist_cpu(iatom, jnbor, jju_index, k).im; } } mapar = -mapar; @@ -1686,19 +1676,12 @@ void SNAKokkos::compute_duarray_cpu(const int jju = idxu_cache_block[j]; for (int mb = 0; 2*mb <= j; 
mb++) for (int ma = 0; ma <= j; ma++) { - dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux + - sfac * dulist(jju,iatom,jnbor,0).re; - dulist(jju,iatom,jnbor,0).im = dsfac * ulist(jju,iatom,jnbor).im * ux + - sfac * dulist(jju,iatom,jnbor,0).im; - dulist(jju,iatom,jnbor,1).re = dsfac * ulist(jju,iatom,jnbor).re * uy + - sfac * dulist(jju,iatom,jnbor,1).re; - dulist(jju,iatom,jnbor,1).im = dsfac * ulist(jju,iatom,jnbor).im * uy + - sfac * dulist(jju,iatom,jnbor,1).im; - dulist(jju,iatom,jnbor,2).re = dsfac * ulist(jju,iatom,jnbor).re * uz + - sfac * dulist(jju,iatom,jnbor,2).re; - dulist(jju,iatom,jnbor,2).im = dsfac * ulist(jju,iatom,jnbor).im * uz + - sfac * dulist(jju,iatom,jnbor,2).im; - + for (int k = 0; k < 3; k++) { + dulist_cpu(iatom, jnbor, jju, k).re = dsfac * ulist_cpu(iatom, jnbor, jju).re * u[k] + + sfac * dulist_cpu(iatom, jnbor, jju, k).re; + dulist_cpu(iatom, jnbor, jju, k).im = dsfac * ulist_cpu(iatom, jnbor, jju).im * u[k] + + sfac * dulist_cpu(iatom, jnbor, jju, k).im; + } jju++; } } @@ -2171,8 +2154,8 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(db_gpu); bytes += MemKK::memory_usage(sfac_gpu); } else { - bytes += MemKK::memory_usage(ulist); - bytes += MemKK::memory_usage(dulist); + bytes += MemKK::memory_usage(ulist_cpu); + bytes += MemKK::memory_usage(dulist_cpu); } bytes += MemKK::memory_usage(dedr); From 277fba1907c63cc3a3d66460ec6bf9d17e2b565e Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 19 Nov 2024 12:44:08 -0800 Subject: [PATCH 13/30] Removed unnecessary team parallelism from CPU routines; rearranged pair_snap_kokkos_impl to make the subsequent CPU/GPU unifications easier to follow. 
--- src/KOKKOS/pair_snap_kokkos.h | 8 +- src/KOKKOS/pair_snap_kokkos_impl.h | 655 +++++++++++++++-------------- src/KOKKOS/sna_kokkos.h | 16 +- src/KOKKOS/sna_kokkos_impl.h | 358 ++++++++-------- 4 files changed, 530 insertions(+), 507 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 7d58f387d6..e7797097c2 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -204,10 +204,10 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPPreUiCPU, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeUiCPU, const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const; @@ -222,10 +222,10 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeYiCPU,const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const; template KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index b2252f85c7..d0647337a8 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -204,7 +204,7 @@ void PairSNAPKokkos::compute(int eflag_in, const int inum_pad = inum_div * vector_length; 
MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta", inum_pad, ncoeff); snaKK.d_beta = d_beta; - MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum); + MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum_pad); } chunk_size = MIN(chunksize, inum); // "chunksize" variable is set by user @@ -235,17 +235,14 @@ void PairSNAPKokkos::compute(int eflag_in, //PreUi { - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); + typename Kokkos::RangePolicy policy_preui_cpu(0, chunk_size); Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); } // ComputeUi { - int team_size = team_size_default; // Fused calculation of ulist and accumulation into ulisttot using atomics - typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + typename Kokkos::RangePolicy policy_ui_cpu(0, chunk_size*max_neighs); Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); } @@ -283,13 +280,10 @@ void PairSNAPKokkos::compute(int eflag_in, //ComputeDuidrj and Deidrj { - int team_size = team_size_default; - - typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + typename Kokkos::RangePolicy policy_duidrj_cpu(0,chunk_size*max_neighs); Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - + typename Kokkos::RangePolicy policy_deidrj_cpu(0,chunk_size*max_neighs); Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); } @@ -601,47 +595,6 @@ void PairSNAPKokkos::coeff(int narg, char snaKK.init(); } -/* ---------------------------------------------------------------------- - Begin routines that are common to both the CPU and GPU codepath. 
-------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBeta, const int& iatom) const { - - if (iatom >= chunk_size) return; - - const int i = d_ilist[iatom + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - - auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; - } - - if (quadraticflag) { - const auto idxb_max = snaKK.idxb_max; - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - const auto idxb = icoeff % idxb_max; - const auto idx_chem = icoeff / idxb_max; - real_type bveci = snaKK.blist(iatom, idx_chem, idxb); - d_beta(iatom, icoeff) += d_coeffi[k] * bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - const auto jdxb = jcoeff % idxb_max; - const auto jdx_chem = jcoeff / idxb_max; - real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb); - d_beta(iatom, icoeff) += d_coeffi[k] * bvecj; - d_beta(iatom, jcoeff) += d_coeffi[k] * bveci; - k++; - } - } - } -} - /* ---------------------------------------------------------------------- Begin routines that are unique to the GPU codepath. 
These take advantage of AoSoA data layouts and scratch memory for recursive polynomials @@ -737,222 +690,6 @@ void PairSNAPKokkos::operator() (TagPairSN }); } -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - const int ninside = d_ninside(iatom); - if (jnbor >= ninside) return; - - snaKK.compute_cayley_klein(iatom,jnbor); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const { - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - int itype = type(iatom); - int ielem = d_map[itype]; - - snaKK.pre_ui(iatom, j, ielem); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug - const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); - const int jbend = jj_jbend / max_neighs; - int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); - }); - -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiLarge,const typename 
Kokkos::TeamPolicy::member_type& team) const { - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug - int jj = flattened_idx - iatom_div * max_neighs; - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); - }); - -} - - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - if (idxu > snaKK.idxu_max) return; - - int elem_count = chemflag ? 
nelements : 1; - - for (int ielem = 0; ielem < elem_count; ielem++) { - - const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - - auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half); - auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half); - - if (mapper.flip_sign == 1) { - utot_im = -utot_im; - } else if (mapper.flip_sign == -1) { - utot_re = -utot_re; - } - - snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im }; - - if (mapper.flip_sign == 0) { - snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.; - snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.; - } - } -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjz >= snaKK.idxz_max) return; - - snaKK.compute_yi(iatom, jjz); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjz >= snaKK.idxz_max) return; - - snaKK.compute_yi_with_zlist(iatom, jjz); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - if (jjz >= snaKK.idxz_max) return; - - snaKK.compute_zi(iatom, jjz); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjb >= snaKK.idxb_max) return; - - snaKK.compute_bi(iatom,jjb); -} - -template -template -KOKKOS_INLINE_FUNCTION -void 
PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicy >::member_type& team) const { - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug - const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); - const int jbend = jj_jbend / max_neighs; - int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - snaKK.template compute_fused_deidrj_small(team, iatom_mod, jbend, jj, iatom_div); - - }); - -} - -template -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicy >::member_type& team) const { - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug - int jj = flattened_idx - max_neighs * iatom_div; - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - snaKK.template compute_fused_deidrj_large(team, iatom_mod, jj, iatom_div); - - }); -} - -/* ---------------------------------------------------------------------- - Begin routines that are unique to the CPU 
codepath. These do not take - advantage of AoSoA data layouts, but that could be a good point of - future optimization and unification with the above kernels. It's unlikely - that scratch memory optimizations will ever be useful for the CPU due to - different arithmetic intensity requirements for the CPU vs GPU. -------------------------------------------------------------------------- */ - template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -1028,36 +765,156 @@ void PairSNAPKokkos::operator() (TagPairSN }); } +/* ---------------------------------------------------------------------- + Pre-compute the Cayley-Klein parameters for reuse in later routines. + GPU only. +------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - // Extract the atom number - const int ii = team.team_rank() + team.team_size() * team.league_rank(); - if (ii >= chunk_size) return; - int itype = type(ii); - int ielem = d_map[itype]; + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; - snaKK.pre_ui_cpu(team,ii,ielem); + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_cayley_klein(iatom, jnbor); } - +/* ---------------------------------------------------------------------- + Initialize the "ulisttot" structure with non-zero on-diagonal terms + and zero terms elsewhere; both CPU and GPU. 
+------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; + int itype = type(iatom); + int ielem = d_map[itype]; - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; + snaKK.pre_ui(iatom, j, ielem); +} - snaKK.compute_ui_cpu(team,ii,jj); +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU, const int& iatom) const { + const int itype = type(iatom); + const int ielem = d_map[itype]; + + snaKK.pre_ui_cpu(iatom, ielem); +} + +/* ---------------------------------------------------------------------- + Accumulate the spectral contributions from atom, neighbor pairs into + ulisttot_re and _im. These routines are GPU only and use scratch memory + staging. 
+------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - iatom_div * max_neighs; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); + }); +} + +/* ---------------------------------------------------------------------- + Accumulate the 
spectral contributions from atom, neighbor pairs into + ulisttot_re and _im. This routine is CPU only and does not use staging. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU, const int& ii) const { + const int iatom = ii / max_neighs; + const int jnbor = ii % max_neighs; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_ui_cpu(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot + structure. Zero-initialize ylist. CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (idxu > snaKK.idxu_max) return; + + int elem_count = chemflag ? nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++) { + + const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; + + auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half); + auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half); + + if (mapper.flip_sign == 1) { + utot_im = -utot_im; + } else if (mapper.flip_sign == -1) { + utot_re = -utot_re; + } + + snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im }; + + if (mapper.flip_sign == 0) { + snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.; + snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.; + } + } } template @@ -1110,10 +967,21 @@ void PairSNAPKokkos::operator() (TagPairSN } } +/* ---------------------------------------------------------------------- + Compute all elements of the Z tensor and store them into the `zlist` + View. 
This is only used for energy timesteps or quadratic SNAP. + CPU and GPU. +------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { - snaKK.compute_yi_cpu(ii); +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + + snaKK.compute_zi(iatom, jjz); } template @@ -1122,6 +990,22 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.compute_zi_cpu(ii); } +/* ---------------------------------------------------------------------- + Compute the energy triple products and store in the "blist" View. + CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjb >= snaKK.idxb_max) return; + + snaKK.compute_bi(iatom,jjb); +} + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU, const int& ii) const { @@ -1130,42 +1014,187 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.compute_bi(iatom, jjb); } -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - 
snaKK.compute_duidrj_cpu(team,ii,jj); -} +/* ---------------------------------------------------------------------- + Assemble the "beta" coefficients that enter the computation of the + adjoint matrices Y. For quadratic SNAP, this includes accumulating + energy triple products into an "effective" beta that encodes the + quadratic terms with otherwise linear compute work. + CPU and GPU. +------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPBeta, const int& iatom) const { - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; + if (iatom >= chunk_size) return; - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; - snaKK.compute_deidrj_cpu(team,ii,jj); + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; + } + + if (quadraticflag) { + const auto idxb_max = snaKK.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + real_type bveci = snaKK.blist(iatom, idx_chem, idxb); + d_beta(iatom, icoeff) += d_coeffi[k] * bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb); + d_beta(iatom, icoeff) += d_coeffi[k] * 
bvecj; + d_beta(iatom, jcoeff) += d_coeffi[k] * bveci; + k++; + } + } + } } /* ---------------------------------------------------------------------- - Also used for both CPU and GPU codepaths. Could maybe benefit from a - separate GPU/CPU codepath, but this kernel takes so little time it's - likely not worth it. + Compute all elements of the Z tensor and accumulate them into the + adjoint matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + + snaKK.compute_yi(iatom, jjz); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU, const int& ii) const { + snaKK.compute_yi_cpu(ii); +} + +/* ---------------------------------------------------------------------- + Accumulate the pre-computed elements of the Z tensor into the adjoint + matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + + snaKK.compute_yi_with_zlist(iatom, jjz); +} + +/* ---------------------------------------------------------------------- + Assemble the force contributions for each atom, neighbor pair by + contracting the adjoint matrices Y with derivatives of the Wigner + matrices U. These routines are GPU only and use scratch memory + staging.
+------------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicy >::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.template compute_fused_deidrj_small(team, iatom_mod, jbend, jj, iatom_div); + + }); + +} + +template +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicy >::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - max_neighs * iatom_div; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.template compute_fused_deidrj_large(team, iatom_mod, jj, iatom_div); + 
+ }); +} + +/* ---------------------------------------------------------------------- + Assemble the derivatives of the Wigner matrices U into the View + "dulist". CPU only. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const { + const int iatom = ii / max_neighs; + const int jnbor = ii % max_neighs; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_duidrj_cpu(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + Assemble the force contributions for each atom, neighbor pair by + contracting the adjoint matrices Y with the pre-computed derivatives + of the Wigner matrices U. CPU only. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const { + const int iatom = ii / max_neighs; + const int jnbor = ii % max_neighs; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_deidrj_cpu(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + This routine formally accumulates the "chunked" force contributions + into the broader LAMMPS "f" force View. As appropriate it + also accumulates the total energy and the virial. CPU and GPU.
------------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index fa187c26ea..4c9cace26a 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -237,9 +237,9 @@ class SNAKokkos { // functions for bispectrum coefficients, CPU only KOKKOS_INLINE_FUNCTION - void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&) const; // ForceSNAP + void pre_ui_cpu(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int) const; // ForceSNAP + void compute_ui_cpu(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_zi_cpu(const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION @@ -247,9 +247,9 @@ class SNAKokkos { // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION - void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int) const; //ForceSNAP + void compute_duidrj_cpu(const int&, const int&) const; //ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int) const; // ForceSNAP + void compute_deidrj_cpu(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION real_type compute_sfac(real_type, real_type, real_type, real_type) const; // add_uarraytot, compute_duarray @@ -358,11 +358,10 @@ class SNAKokkos { void init_rootpqarray(); // init() KOKKOS_INLINE_FUNCTION - void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui + void add_uarraytot(int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui KOKKOS_INLINE_FUNCTION - void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - 
const real_type&, const real_type&, const real_type&, + void compute_uarray_cpu(int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&) const; // compute_ui_cpu @@ -372,8 +371,7 @@ class SNAKokkos { inline int compute_ncoeff(); // SNAKokkos() KOKKOS_INLINE_FUNCTION - void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu + void compute_duarray_cpu(int, int, const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&) const; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 7988026dd2..6cb83b3910 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -1156,17 +1156,14 @@ typename SNAKokkos::real_type SNAKokkos KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) const +void SNAKokkos::pre_ui_cpu(const int& iatom, const int& ielem) const { for (int jelem = 0; jelem < nelements; jelem++) { for (int j = 0; j <= twojmax; j++) { int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug // Only diagonal elements get initialized - // for (int m = 0; m < (j+1)*(j/2+1); m++) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)), - [&] (const int m) { - + for (int m = 0; m < (j+1)*(j/2+1); m++) { const int jjup = jju + m; // if m is on the "diagonal", initialize it with the self energy. 
@@ -1176,7 +1173,7 @@ void SNAKokkos::pre_ui_cpu(const typename ulisttot_re(iatom, jelem, jjup) = init; ulisttot_im(iatom, jelem, jjup) = 0; - }); + }; } } @@ -1191,7 +1188,7 @@ void SNAKokkos::pre_ui_cpu(const typename template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) const +void SNAKokkos::compute_ui_cpu(const int& iatom, const int& jnbor) const { real_type rsq, r, x, y, z, z0, theta0; @@ -1211,8 +1208,8 @@ void SNAKokkos::compute_ui_cpu(const typen // theta0 = (r - rmin0) * rscale0; z0 = r / tan(theta0); - compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); - add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor)); + compute_uarray_cpu(iatom, jnbor, x, y, z, z0, r); + add_uarraytot(iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor)); } /* ---------------------------------------------------------------------- @@ -1328,7 +1325,7 @@ void SNAKokkos::compute_yi_cpu(int iter) c template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) const +void SNAKokkos::compute_duidrj_cpu(const int& iatom, const int& jnbor) const { real_type rsq, r, x, y, z, z0, theta0, cs, sn; real_type dz0dr; @@ -1345,7 +1342,7 @@ void SNAKokkos::compute_duidrj_cpu(const t z0 = r * cs / sn; dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)); + compute_duarray_cpu(iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)); } @@ -1360,54 +1357,42 @@ void SNAKokkos::compute_duidrj_cpu(const t template KOKKOS_INLINE_FUNCTION -void 
SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) const +void SNAKokkos::compute_deidrj_cpu(const int& iatom, const int& jnbor) const { - t_scalar3 final_sum; + real_type force_sum[3] = { 0, 0, 0 }; const int jelem = element(iatom, jnbor); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j, t_scalar3& sum_tmp) { + for (int j = 0; j <= twojmax; j++) { int jju_half = idxu_half_block[j]; int jju_cache = idxu_cache_block[j]; - for (int mb = 0; 2*mb < j; mb++) + for (int mb = 0; 2 * mb < j; mb++) { for (int ma = 0; ma <= j; ma++) { - const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; - sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im; - sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im; - sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im; + complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + for (int k = 0; k < 3; k++) + force_sum[k] += dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + + dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im; jju_half++; jju_cache++; - } //end loop over ma mb + } + } //end loop over ma mb // For j even, handle middle column - if (j%2 == 0) { - - int mb = j/2; - for (int ma = 0; ma < mb; ma++) { - const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; - sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im; - sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im; - sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im; + if (j % 
2 == 0) { + //int mb = j / 2; + for (int ma = 0; ma <= j; ma++) { + complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + for (int k = 0; k < 3; k++) + force_sum[k] += static_cast(0.5) * (dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + + dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im); jju_half++; jju_cache++; } - - //int ma = mb; - // 0.5 is meant to avoid double-counting - const complex y_val = { 0.5 * ylist_re(iatom, jelem, jju_half), 0.5 * ylist_im(iatom, jelem, jju_half) }; - sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im; - sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im; - sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im; } // end if jeven - },final_sum); // end loop over j - - Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,0) = final_sum.x*2.0; - dedr(iatom,jnbor,1) = final_sum.y*2.0; - dedr(iatom,jnbor,2) = final_sum.z*2.0; - }); + } + for (int k = 0; k < 3; k++) + dedr(iatom, jnbor, k) = 2 * force_sum[k]; } @@ -1421,15 +1406,13 @@ void SNAKokkos::compute_deidrj_cpu(const t template KOKKOS_INLINE_FUNCTION -void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, +void SNAKokkos::add_uarraytot(int iatom, int jnbor, const real_type& r, const real_type& wj, const real_type& rcut, const real_type& sinner, const real_type& dinner, int jelem) const { const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j) { - + for (int j = 0; j <= twojmax; j++) { int jju_half = idxu_half_block[j]; // index into ulisttot int jju_cache = idxu_cache_block[j]; // index into ulist @@ -1441,7 +1424,7 @@ void SNAKokkos::add_uarraytot(const typena count++; } } - }); + } 
} /* ---------------------------------------------------------------------- @@ -1452,23 +1435,15 @@ void SNAKokkos::add_uarraytot(const typena template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, +void SNAKokkos::compute_uarray_cpu(int iatom, int jnbor, const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const { - real_type r0inv; - real_type a_r, b_r, a_i, b_i; - real_type rootpq; - // compute Cayley-Klein parameters for unit quaternion - - r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); - a_r = r0inv * z0; - a_i = -r0inv * z; - b_r = r0inv * y; - b_i = -r0inv * x; + real_type r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); + complex a = { r0inv * z0, -r0inv * z }; + complex b = { r0inv * y, -r0inv * x }; // VMK Section 4.8.2 - ulist_cpu(iatom, jnbor, 0) = complex::one(); for (int j = 1; j <= twojmax; j++) { @@ -1476,61 +1451,65 @@ void SNAKokkos::compute_uarray_cpu(const t int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug // fill in left side of matrix layer from previous layer + for (int mb = 0; 2*mb <= j; mb++) { + int jju_index = jju + (j + 1) * mb; + int jjup_index = jjup + j * mb; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), - [&] (const int& mb) { - //for (int mb = 0; 2*mb <= j; mb++) { - const int jju_index = jju+mb+mb*j; - ulist_cpu(iatom, jnbor, jju_index) = { static_cast(0), static_cast(0) }; + complex ui = complex::zero(); for (int ma = 0; ma < j; ma++) { - const int jju_index = jju+mb+mb*j+ma; - const int jjup_index = jjup+mb*j+ma; - rootpq = rootpqarray(j - ma,j - mb); - ulist_cpu(iatom, jnbor, jju_index).re += - rootpq * - (a_r * ulist_cpu(iatom, jnbor, jjup_index).re + - a_i * ulist_cpu(iatom, jnbor, jjup_index).im); - ulist_cpu(iatom, jnbor, jju_index).im += - rootpq * - (a_r * ulist_cpu(iatom, jnbor, jjup_index).im - - a_i * ulist_cpu(iatom, jnbor, 
jjup_index).re); + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); - rootpq = rootpqarray(ma + 1,j - mb); - ulist_cpu(iatom, jnbor, jju_index+1).re = - -rootpq * - (b_r * ulist_cpu(iatom, jnbor, jjup_index).re + - b_i * ulist_cpu(iatom, jnbor, jjup_index).im); - ulist_cpu(iatom, jnbor, jju_index+1).im = - -rootpq * - (b_r * ulist_cpu(iatom, jnbor, jjup_index).im - - b_i * ulist_cpu(iatom, jnbor, jjup_index).re); + real_type rootpq = rootpqarray(j - ma, j - mb); + ui.re += rootpq * (a.re * ui_prev.re + a.im * ui_prev.im); + ui.im += rootpq * (a.re * ui_prev.im - a.im * ui_prev.re); + + ulist_cpu(iatom, jnbor, jju_index) = ui; + + rootpq = rootpqarray(ma + 1, j - mb); + ui.re = -rootpq * (b.re * ui_prev.re + b.im * ui_prev.im); + ui.im = -rootpq * (b.re * ui_prev.im - b.im * ui_prev.re); + + jju_index++; + jjup_index++; } - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) + ulist_cpu(iatom, jnbor, jju_index) = ui; + } - // Only need to add one symmetrized row for convenience - // Symmetry gets "unfolded" in accumulating ulisttot - if (j%2==1 && mb==(j/2)) { - const int mbpar = (mb)%2==0?1:-1; - int mapar = mbpar; - for (int ma = 0; ma <= j; ma++) { - const int jju_index = jju + mb*(j+1) + ma; - const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1); - if (mapar == 1) { - ulist_cpu(iatom, jnbor, jjup_index).re = ulist_cpu(iatom, jnbor, jju_index).re; - ulist_cpu(iatom, jnbor, jjup_index).im = -ulist_cpu(iatom, jnbor, jju_index).im; - } else { - ulist_cpu(iatom, jnbor, jjup_index).re = -ulist_cpu(iatom, jnbor, jju_index).re; - ulist_cpu(iatom, jnbor, jjup_index).im = ulist_cpu(iatom, jnbor, jju_index).im; - } - mapar = -mapar; - } + // If j is odd (half-integer in the mathematical convention), we need + // to add one more row for convenience (for now). 
This can either be done + // via symmetry (see the commented code below), or by the equations to fill + // from the left instead of the right + if (j % 2 == 1) { + int mb = j / 2; + // begin filling in the extra row + int jju_index = jju + (mb + 1) * (j + 1); + int jjup_index = jjup + mb * j; + + complex ui = complex::zero(); + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + + real_type rootpq = rootpqarray(j - ma, mb + 1); + ui.re += rootpq * (b.re * ui_prev.re - b.im * ui_prev.im); + ui.im += rootpq * (b.re * ui_prev.im + b.im * ui_prev.re); + + ulist_cpu(iatom, jnbor, jju_index) = ui; + + rootpq = rootpqarray(ma + 1, mb + 1); + ui.re = rootpq * (a.re * ui_prev.re - a.im * ui_prev.im); + ui.im = rootpq * (a.re * ui_prev.im + a.im * ui_prev.re); + + jju_index++; + jjup_index++; } - }); + ulist_cpu(iatom, jnbor, jju_index) = ui; + } } + } /* ---------------------------------------------------------------------- @@ -1541,30 +1520,25 @@ void SNAKokkos::compute_uarray_cpu(const t template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, +void SNAKokkos::compute_duarray_cpu(int iatom, int jnbor, const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r, const real_type& dz0dr, const real_type& wj, const real_type& rcut, const real_type& sinner, const real_type& dinner) const { - real_type r0inv; - real_type a_r, a_i, b_r, b_i; - real_type u[3], da_r[3], da_i[3], db_r[3], db_i[3]; - real_type dz0[3], dr0inv[3], dr0invdr; - real_type rootpq; + complex da[3], db[3]; + real_type u[3], dz0[3], dr0inv[3]; real_type rinv = 1.0 / r; u[0] = x * rinv; u[1] = y * rinv; u[2] = z * rinv; - r0inv = 1.0 / sqrt(r * r + z0 * z0); - a_r = z0 * r0inv; - a_i = -z * r0inv; - b_r = y * r0inv; - b_i = -x * r0inv; + real_type r0inv = 1.0 / sqrt(r * r + z0 * z0); + complex a = { z0 * r0inv, -z * r0inv }; + complex b = { y 
* r0inv, -x * r0inv }; - dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); + real_type dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); dr0inv[0] = dr0invdr * u[0]; dr0inv[1] = dr0invdr * u[1]; @@ -1575,19 +1549,19 @@ void SNAKokkos::compute_duarray_cpu(const dz0[2] = dz0dr * u[2]; for (int k = 0; k < 3; k++) { - da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k]; - da_i[k] = -z * dr0inv[k]; + da[k].re = dz0[k] * r0inv + z0 * dr0inv[k]; + da[k].im = -z * dr0inv[k]; } - da_i[2] += -r0inv; + da[2].im += -r0inv; for (int k = 0; k < 3; k++) { - db_r[k] = y * dr0inv[k]; - db_i[k] = -x * dr0inv[k]; + db[k].re = y * dr0inv[k]; + db[k].im = -x * dr0inv[k]; } - db_i[0] += -r0inv; - db_r[1] += r0inv; + db[0].im += -r0inv; + db[1].re += r0inv; for (int k = 0; k < 3; k++) dulist_cpu(iatom, jnbor, 0, k) = complex::zero(); @@ -1595,72 +1569,94 @@ void SNAKokkos::compute_duarray_cpu(const for (int j = 1; j <= twojmax; j++) { int jju = idxu_cache_block[j]; int jjup = idxu_cache_block[j-1]; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), - [&] (const int& mb) { - //for (int mb = 0; 2*mb <= j; mb++) { - const int jju_index = jju+mb+mb*j; - for (int k = 0; k < 3; k++) - dulist_cpu(iatom, jnbor, jju_index, k) = complex::zero(); + + for (int mb = 0; 2*mb <= j; mb++) { + int jju_index = jju + mb * (j + 1); + int jjup_index = jjup + mb * j; + + complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() }; for (int ma = 0; ma < j; ma++) { - const int jju_index = jju+mb+mb*j+ma; - const int jjup_index = jjup+mb*j+ma; - rootpq = rootpqarray(j - ma,j - mb); + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0), + dulist_cpu(iatom, jnbor, jjup_index, 1), + dulist_cpu(iatom, jnbor, jjup_index, 2) }; + + real_type rootpq = rootpqarray(j - ma,j - mb); for (int k = 0; k < 3; k++) { - dulist_cpu(iatom, jnbor, jju_index, k).re += - rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re + - 
da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im + - a_r * dulist_cpu(iatom, jnbor, jjup_index, k).re + - a_i * dulist_cpu(iatom, jnbor, jjup_index, k).im); - dulist_cpu(iatom, jnbor, jju_index, k).im += - rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im - - da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re + - a_r * dulist_cpu(iatom, jnbor, jjup_index, k).im - - a_i * dulist_cpu(iatom, jnbor, jjup_index, k).re); + duidrj[k].re += rootpq * (da[k].re * ui_prev.re + da[k].im * ui_prev.im + + a.re * duidrj_prev[k].re + a.im * duidrj_prev[k].im); + + duidrj[k].im += rootpq * (da[k].re * ui_prev.im - da[k].im * ui_prev.re + + a.re * duidrj_prev[k].im - a.im * duidrj_prev[k].re); + + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; } rootpq = rootpqarray(ma + 1,j - mb); for (int k = 0; k < 3; k++) { - dulist_cpu(iatom, jnbor, jju_index+1, k).re = - -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re + - db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im + - b_r * dulist_cpu(iatom, jnbor, jjup_index, k).re + - b_i * dulist_cpu(iatom, jnbor, jjup_index, k).im); - dulist_cpu(iatom, jnbor, jju_index+1, k).im = - -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im - - db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re + - b_r * dulist_cpu(iatom, jnbor, jjup_index, k).im - - b_i * dulist_cpu(iatom, jnbor, jjup_index, k).re); + duidrj[k].re = -rootpq * (db[k].re * ui_prev.re + db[k].im * ui_prev.im + + b.re * duidrj_prev[k].re + b.im * duidrj_prev[k].im); + + duidrj[k].im = -rootpq * (db[k].re * ui_prev.im - db[k].im * ui_prev.re + + b.re * duidrj_prev[k].im - b.im * duidrj_prev[k].re); } + + jju_index++; + jjup_index++; } - // Only need to add one symmetrized row for convenience - // Symmetry gets "unfolded" during the dedr accumulation - - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) - - if (j%2==1 && mb==(j/2)) { - const int mbpar = (mb)%2==0?1:-1; - int mapar = mbpar; - for (int 
ma = 0; ma <= j; ma++) { - const int jju_index = jju+mb*(j+1)+ma; - const int jjup_index = jju+(mb+2)*(j+1)-(ma+1); - if (mapar == 1) { - for (int k = 0; k < 3; k++) { - dulist_cpu(iatom, jnbor, jjup_index, k).re = dulist_cpu(iatom, jnbor, jju_index, k).re; - dulist_cpu(iatom, jnbor, jjup_index, k).im = -dulist_cpu(iatom, jnbor, jju_index, k).im; - } - } else { - for (int k = 0; k < 3; k++) { - dulist_cpu(iatom, jnbor, jjup_index, k).re = -dulist_cpu(iatom, jnbor, jju_index, k).re; - dulist_cpu(iatom, jnbor, jjup_index, k).im = dulist_cpu(iatom, jnbor, jju_index, k).im; - } - } - mapar = -mapar; - } + for (int k = 0; k < 3; k++) { + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; } - }); + } + + // Only need to add one symmetrized row for convenience + // Symmetry gets "unfolded" during the dedr accumulation + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) + + if (j % 2 == 1) { + int mb = j / 2; + // begin filling in the extra row + int jju_index = jju + (mb + 1) * (j + 1); + int jjup_index = jjup + mb * j; + + complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() }; + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0), + dulist_cpu(iatom, jnbor, jjup_index, 1), + dulist_cpu(iatom, jnbor, jjup_index, 2) }; + + real_type rootpq = rootpqarray(j - ma, mb + 1); + for (int k = 0; k < 3; k++) { + duidrj[k].re += rootpq * (db[k].re * ui_prev.re - db[k].im * ui_prev.im + + b.re * duidrj_prev[k].re - b.im * duidrj_prev[k].im); + duidrj[k].im += rootpq * (db[k].re * ui_prev.im + db[k].im * ui_prev.re + + b.re * duidrj_prev[k].im + b.im * duidrj_prev[k].re); + + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; + } + + rootpq = rootpqarray(ma + 1, mb + 1); + for (int k = 0; k < 3; k++) { + duidrj[k].re = rootpq * (da[k].re * ui_prev.re - da[k].im * ui_prev.im + + a.re * 
duidrj_prev[k].re - a.im * duidrj_prev[k].im); + duidrj[k].im = rootpq * (da[k].re * ui_prev.im + da[k].im * ui_prev.re + + a.re * duidrj_prev[k].im + a.im * duidrj_prev[k].re); + } + + jju_index++; + jjup_index++; + } + + for (int k = 0; k < 3; k++) { + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; + } + } } real_type sfac = compute_sfac(r, rcut, sinner, dinner); @@ -1678,9 +1674,9 @@ void SNAKokkos::compute_duarray_cpu(const for (int ma = 0; ma <= j; ma++) { for (int k = 0; k < 3; k++) { dulist_cpu(iatom, jnbor, jju, k).re = dsfac * ulist_cpu(iatom, jnbor, jju).re * u[k] + - sfac * dulist_cpu(iatom, jnbor, jju, k).re; + sfac * dulist_cpu(iatom, jnbor, jju, k).re; dulist_cpu(iatom, jnbor, jju, k).im = dsfac * ulist_cpu(iatom, jnbor, jju).im * u[k] + - sfac * dulist_cpu(iatom, jnbor, jju, k).im; + sfac * dulist_cpu(iatom, jnbor, jju, k).im; } jju++; } From 457e4c094b001be1b2ec9837d6cd5af16648bec1 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 09:41:57 -0800 Subject: [PATCH 14/30] No actual changes; this code just rearranges sna_kokkos_impl to make the subsequent CPU/GPU unifications easier to follow. --- src/KOKKOS/sna_kokkos_impl.h | 775 +++++++++++++++++------------------ 1 file changed, 374 insertions(+), 401 deletions(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 6cb83b3910..24af462cc5 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -464,7 +464,30 @@ void SNAKokkos::pre_ui(const int& iatom, c } } } +} +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::pre_ui_cpu(const int& iatom, const int& ielem) const +{ + for (int jelem = 0; jelem < nelements; jelem++) { + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug + + // Only diagonal elements get initialized + for (int m = 0; m < (j+1)*(j/2+1); m++) { + const int jjup = jju + m; + + // if m is on the "diagonal", initialize it with the self energy. 
+ // Otherwise zero it out + real_type init = 0; + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = wself; } //need to map iatom to element + + ulisttot_re(iatom, jelem, jjup) = init; + ulisttot_im(iatom, jelem, jjup) = 0; + }; + } + } } /* ---------------------------------------------------------------------- @@ -635,13 +658,157 @@ void SNAKokkos::evaluate_ui_jbend(const Wi Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jjup + ma)), ulist_prev.re * sfac); Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jjup + ma)), ulist_prev.im * sfac); } - } /* ---------------------------------------------------------------------- - compute Zi by summing over products of Ui, - AoSoA data layout to take advantage of coalescing, avoiding warp - divergence. GPU version + compute Ui by summing over bispectrum components. CPU only. + See comments above compute_uarray_cpu and add_uarraytot for + data layout comments. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_ui_cpu(const int& iatom, const int& jnbor) const +{ + real_type rsq, r, x, y, z, z0, theta0; + + // utot(j,ma,mb) = 0 for all j,ma,ma + // utot(j,ma,ma) = 1 for all j,ma + // for j in neighbors of i: + // compute r0 = (x,y,z,z0) + // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb + + x = rij(iatom,jnbor,0); + y = rij(iatom,jnbor,1); + z = rij(iatom,jnbor,2); + rsq = x * x + y * y + z * z; + r = sqrt(rsq); + + theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + // theta0 = (r - rmin0) * rscale0; + z0 = r / tan(theta0); + + compute_uarray_cpu(iatom, jnbor, x, y, z, z0, r); + add_uarraytot(iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor)); +} + +/* ---------------------------------------------------------------------- + compute Wigner U-functions for one neighbor. 
+ `ulisttot` uses a "cached" data layout, matching the amount of + information stored between layers via scratch memory on the GPU path +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_uarray_cpu(int iatom, int jnbor, + const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const +{ + // compute Cayley-Klein parameters for unit quaternion + real_type r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); + complex a = { r0inv * z0, -r0inv * z }; + complex b = { r0inv * y, -r0inv * x }; + + // VMK Section 4.8.2 + ulist_cpu(iatom, jnbor, 0) = complex::one(); + + for (int j = 1; j <= twojmax; j++) { + int jju = idxu_cache_block[j]; // removed "const" to work around GCC 7 bug + int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug + + // fill in left side of matrix layer from previous layer + for (int mb = 0; 2*mb <= j; mb++) { + int jju_index = jju + (j + 1) * mb; + int jjup_index = jjup + j * mb; + + complex ui = complex::zero(); + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + + real_type rootpq = rootpqarray(j - ma, j - mb); + ui.re += rootpq * (a.re * ui_prev.re + a.im * ui_prev.im); + ui.im += rootpq * (a.re * ui_prev.im - a.im * ui_prev.re); + + ulist_cpu(iatom, jnbor, jju_index) = ui; + + rootpq = rootpqarray(ma + 1, j - mb); + ui.re = -rootpq * (b.re * ui_prev.re + b.im * ui_prev.im); + ui.im = -rootpq * (b.re * ui_prev.im - b.im * ui_prev.re); + + jju_index++; + jjup_index++; + } + + ulist_cpu(iatom, jnbor, jju_index) = ui; + } + + // If j is odd (half-integer in the mathematical convention), we need + // to add one more row for convenience (for now). 
This can either be done + // via symmetry (see the commented code below), or by the equations to fill + // from the left instead of the right + if (j % 2 == 1) { + int mb = j / 2; + // begin filling in the extra row + int jju_index = jju + (mb + 1) * (j + 1); + int jjup_index = jjup + mb * j; + + complex ui = complex::zero(); + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + + real_type rootpq = rootpqarray(j - ma, mb + 1); + ui.re += rootpq * (b.re * ui_prev.re - b.im * ui_prev.im); + ui.im += rootpq * (b.re * ui_prev.im + b.im * ui_prev.re); + + ulist_cpu(iatom, jnbor, jju_index) = ui; + + rootpq = rootpqarray(ma + 1, mb + 1); + ui.re = rootpq * (a.re * ui_prev.re - a.im * ui_prev.im); + ui.im = rootpq * (a.re * ui_prev.im + a.im * ui_prev.re); + + jju_index++; + jjup_index++; + } + + ulist_cpu(iatom, jnbor, jju_index) = ui; + } + } +} + +/* ---------------------------------------------------------------------- + add Wigner U-functions for one neighbor to the total + ulist is in a "cached" data layout, which is a compressed layout + which still keeps the recursive calculation simple. On the other hand + `ulisttot` uses a "half" data layout, which fully takes advantage + of the symmetry of the Wigner U matrices. 
+------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::add_uarraytot(int iatom, int jnbor, + const real_type& r, const real_type& wj, const real_type& rcut, + const real_type& sinner, const real_type& dinner, int jelem) const +{ + const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj; + + for (int j = 0; j <= twojmax; j++) { + int jju_half = idxu_half_block[j]; // index into ulisttot + int jju_cache = idxu_cache_block[j]; // index into ulist + + int count = 0; + for (int mb = 0; 2*mb <= j; mb++) { + for (int ma = 0; ma <= j; ma++) { + Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).re); + Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).im); + count++; + } + } + } +} + +/* ---------------------------------------------------------------------- + compute Zi by summing over products of Ui ------------------------------------------------------------------------- */ template @@ -665,6 +832,82 @@ void SNAKokkos::compute_zi(const int& iato } } +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_zi_cpu(const int& iter) const +{ + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg; + idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg); + + const real_type *cgblock = cglist.data() + idxcg; + + int idouble = 0; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); + idouble++; + } // end loop over elem2 + } // end loop over elem1 +} + +/* ---------------------------------------------------------------------- + Core "evaluation" kernel that computes a single zlist value. 
+ This gets used in both `compute_zi` and `compute_yi` +------------------------------------------------------------------------- */ + +template +KOKKOS_FORCEINLINE_FUNCTION +typename SNAKokkos::complex SNAKokkos::evaluate_zi(const int& j1, const int& j2, const int& j, + const int& ma1min, const int& ma2max, const int& mb1min, const int& mb2max, const int& na, const int& nb, + const int& iatom, const int& elem1, const int& elem2, const real_type* cgblock) const { + complex ztmp = complex::zero(); + + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + + #ifdef LMP_KK_DEVICE_COMPILE + #pragma unroll + #endif + for (int ib = 0; ib < nb; ib++) { + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + + #ifdef LMP_KK_DEVICE_COMPILE + #pragma unroll + #endif + for (int ia = 0; ia < na; ia++) { + const complex utot1 = ulisttot(iatom, elem1, jju1+ma1); + const complex utot2 = ulisttot(iatom, elem2, jju2+ma2); + const real_type cgcoeff_a = cgblock[icga]; + const real_type cgcoeff_b = cgblock[icgb]; + ztmp.re += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); + ztmp.im += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + const real_type scale = static_cast(1) / static_cast(j + 1); + ztmp.re *= scale; + ztmp.im *= scale; + } + + return ztmp; +} + /* ---------------------------------------------------------------------- compute Bi by summing conj(Ui)*Zi AoSoA data layout to take advantage of coalescing, avoiding warp @@ -759,9 +1002,7 @@ void SNAKokkos::compute_bi(const int& iato /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices. 
- AoSoA data layout to take advantage of coalescing, avoiding warp - divergence. GPU version. + Compute Yi from Ui without storing Zi, looping over zlist indices. ------------------------------------------------------------------------- */ template @@ -797,10 +1038,81 @@ void SNAKokkos::compute_yi(const int& iato } // end loop over elem1 } +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_yi_cpu(int iter) const +{ + real_type betaj; + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; + idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg); + + const real_type *cgblock = cglist.data() + idxcg; + //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; + //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + + real_type ztmp_r = 0.0; + real_type ztmp_i = 0.0; + + int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; + int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; + int icgb = mb1min * (j2 +1) + mb2max; + + for (int ib = 0; ib < nb; ib++) { + + real_type suma1_r = 0.0; + real_type suma1_i = 0.0; + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + + for (int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re - + ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im); + suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im + + ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + ztmp_r += cgblock[icgb] * suma1_r; + ztmp_i += cgblock[icgb] * suma1_i; + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + const real_type scale = static_cast(1) / 
static_cast(j + 1); + ztmp_i *= scale; + ztmp_r *= scale; + } + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); + + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj*ztmp_i); + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 +} + /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices. - AoSoA data layout to take advantage of coalescing, avoiding warp - divergence. GPU version. + compute Yi from Ui with the precomputed Zi. ------------------------------------------------------------------------- */ template @@ -831,59 +1143,11 @@ void SNAKokkos::compute_yi_with_zlist(cons } // end loop over elem1 } -// Core "evaluation" kernel that computes a single zlist value -// which gets used in both `compute_zi` and `compute_yi` -template -KOKKOS_FORCEINLINE_FUNCTION -typename SNAKokkos::complex SNAKokkos::evaluate_zi(const int& j1, const int& j2, const int& j, - const int& ma1min, const int& ma2max, const int& mb1min, const int& mb2max, const int& na, const int& nb, - const int& iatom, const int& elem1, const int& elem2, const real_type* cgblock) const { - complex ztmp = complex::zero(); +/* ---------------------------------------------------------------------- + Core "evaluation" kernel that extracts and rescales the appropriate + `beta` value which gets used in both `compute_yi` and `compute_yi_with_zlist` +------------------------------------------------------------------------- */ - int jju1 = idxu_block[j1] + (j1+1)*mb1min; - int jju2 = idxu_block[j2] +
(j2+1)*mb2max; - int icgb = mb1min*(j2+1) + mb2max; - - #ifdef LMP_KK_DEVICE_COMPILE - #pragma unroll - #endif - for (int ib = 0; ib < nb; ib++) { - - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min*(j2+1) + ma2max; - - #ifdef LMP_KK_DEVICE_COMPILE - #pragma unroll - #endif - for (int ia = 0; ia < na; ia++) { - const complex utot1 = ulisttot(iatom, elem1, jju1+ma1); - const complex utot2 = ulisttot(iatom, elem2, jju2+ma2); - const real_type cgcoeff_a = cgblock[icga]; - const real_type cgcoeff_b = cgblock[icgb]; - ztmp.re += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); - ztmp.im += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia - - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib - - if (bnorm_flag) { - const real_type scale = static_cast(1) / static_cast(j + 1); - ztmp.re *= scale; - ztmp.im *= scale; - } - - return ztmp; -} - -// Core "evaluation" kernel that extracts and rescales the appropriate `beta` value, -// which gets used in both `compute_yi` and `compute_yi_from_zlist template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::real_type SNAKokkos::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, @@ -1145,178 +1409,6 @@ typename SNAKokkos::real_type SNAKokkos -KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui_cpu(const int& iatom, const int& ielem) const -{ - for (int jelem = 0; jelem < nelements; jelem++) { - for (int j = 0; j <= twojmax; j++) { - int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug - - // Only diagonal elements get initialized - for (int m = 0; m < (j+1)*(j/2+1); m++) { - const int jjup = jju + m; - - // if m is on the "diagonal", initialize it with the self energy. 
- // Otherwise zero it out - real_type init = 0; - if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = wself; } //need to map iatom to element - - ulisttot_re(iatom, jelem, jjup) = init; - ulisttot_im(iatom, jelem, jjup) = 0; - }; - } - } - -} - - -/* ---------------------------------------------------------------------- - compute Ui by summing over bispectrum components. CPU only. - See comments above compute_uarray_cpu and add_uarraytot for - data layout comments. -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_cpu(const int& iatom, const int& jnbor) const -{ - real_type rsq, r, x, y, z, z0, theta0; - - // utot(j,ma,mb) = 0 for all j,ma,ma - // utot(j,ma,ma) = 1 for all j,ma - // for j in neighbors of i: - // compute r0 = (x,y,z,z0) - // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb - - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - - theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); - // theta0 = (r - rmin0) * rscale0; - z0 = r / tan(theta0); - - compute_uarray_cpu(iatom, jnbor, x, y, z, z0, r); - add_uarraytot(iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor)); - -} -/* ---------------------------------------------------------------------- - compute Zi by summing over products of Ui, CPU version -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi_cpu(const int& iter) const -{ - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; - - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg; - idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg); - - const real_type *cgblock = cglist.data() + idxcg; - - int idouble = 0; - - for (int elem1 = 
0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); - idouble++; - } // end loop over elem2 - } // end loop over elem1 -} - -/* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices, - CPU version -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi_cpu(int iter) const -{ - real_type betaj; - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; - - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; - idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg); - - const real_type *cgblock = cglist.data() + idxcg; - //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; - //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; - - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - - real_type ztmp_r = 0.0; - real_type ztmp_i = 0.0; - - int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; - int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 +1) + mb2max; - - for (int ib = 0; ib < nb; ib++) { - - real_type suma1_r = 0.0; - real_type suma1_i = 0.0; - - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min*(j2+1) + ma2max; - - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re - - ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im); - suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im + - ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia - - ztmp_r += cgblock[icgb] * suma1_r; - 
ztmp_i += cgblock[icgb] * suma1_i; - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib - - if (bnorm_flag) { - const real_type scale = static_cast(1) / static_cast(j + 1); - ztmp_i *= scale; - ztmp_r *= scale; - } - - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); - - Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj*ztmp_r); - Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj*ztmp_i); - } // end loop over elem3 - } // end loop over elem2 - } // end loop over elem1 -} - - /* ---------------------------------------------------------------------- calculate derivative of Ui w.r.t. atom j see comments above compute_duarray_cpu for comments on the @@ -1345,173 +1437,6 @@ void SNAKokkos::compute_duidrj_cpu(const i compute_duarray_cpu(iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)); } - -/* ---------------------------------------------------------------------- - compute dEidRj, CPU path only. - dulist takes advantage of a `cached` data layout, similar to the - shared memory layout for the GPU routines, which is efficient for - compressing the calculation in compute_duarray_cpu. That said, - dulist only uses the "half" data layout part of that structure. 
-------------------------------------------------------------------------- */ - - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj_cpu(const int& iatom, const int& jnbor) const -{ - real_type force_sum[3] = { 0, 0, 0 }; - const int jelem = element(iatom, jnbor); - - for (int j = 0; j <= twojmax; j++) { - int jju_half = idxu_half_block[j]; - int jju_cache = idxu_cache_block[j]; - - for (int mb = 0; 2 * mb < j; mb++) { - for (int ma = 0; ma <= j; ma++) { - complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; - for (int k = 0; k < 3; k++) - force_sum[k] += dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + - dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im; - jju_half++; jju_cache++; - } - } //end loop over ma mb - - // For j even, handle middle column - - if (j % 2 == 0) { - //int mb = j / 2; - for (int ma = 0; ma <= j; ma++) { - complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; - for (int k = 0; k < 3; k++) - force_sum[k] += static_cast(0.5) * (dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + - dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im); - jju_half++; jju_cache++; - } - } // end if jeven - - } - - for (int k = 0; k < 3; k++) - dedr(iatom, jnbor, k) = 2 * force_sum[k]; -} - - -/* ---------------------------------------------------------------------- - add Wigner U-functions for one neighbor to the total - ulist is in a "cached" data layout, which is a compressed layout - which still keeps the recursive calculation simple. On the other hand - `ulisttot` uses a "half" data layout, which fully takes advantage - of the symmetry of the Wigner U matrices. 
-------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::add_uarraytot(int iatom, int jnbor, - const real_type& r, const real_type& wj, const real_type& rcut, - const real_type& sinner, const real_type& dinner, int jelem) const -{ - const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj; - - for (int j = 0; j <= twojmax; j++) { - int jju_half = idxu_half_block[j]; // index into ulisttot - int jju_cache = idxu_cache_block[j]; // index into ulist - - int count = 0; - for (int mb = 0; 2*mb <= j; mb++) { - for (int ma = 0; ma <= j; ma++) { - Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).re); - Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).im); - count++; - } - } - } -} - -/* ---------------------------------------------------------------------- - compute Wigner U-functions for one neighbor. 
- `ulisttot` uses a "cached" data layout, matching the amount of - information stored between layers via scratch memory on the GPU path -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uarray_cpu(int iatom, int jnbor, - const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const -{ - // compute Cayley-Klein parameters for unit quaternion - real_type r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); - complex a = { r0inv * z0, -r0inv * z }; - complex b = { r0inv * y, -r0inv * x }; - - // VMK Section 4.8.2 - ulist_cpu(iatom, jnbor, 0) = complex::one(); - - for (int j = 1; j <= twojmax; j++) { - int jju = idxu_cache_block[j]; // removed "const" to work around GCC 7 bug - int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug - - // fill in left side of matrix layer from previous layer - for (int mb = 0; 2*mb <= j; mb++) { - int jju_index = jju + (j + 1) * mb; - int jjup_index = jjup + j * mb; - - complex ui = complex::zero(); - - for (int ma = 0; ma < j; ma++) { - complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); - - real_type rootpq = rootpqarray(j - ma, j - mb); - ui.re += rootpq * (a.re * ui_prev.re + a.im * ui_prev.im); - ui.im += rootpq * (a.re * ui_prev.im - a.im * ui_prev.re); - - ulist_cpu(iatom, jnbor, jju_index) = ui; - - rootpq = rootpqarray(ma + 1, j - mb); - ui.re = -rootpq * (b.re * ui_prev.re + b.im * ui_prev.im); - ui.im = -rootpq * (b.re * ui_prev.im - b.im * ui_prev.re); - - jju_index++; - jjup_index++; - } - - ulist_cpu(iatom, jnbor, jju_index) = ui; - } - - // If j is odd (half-integer in the mathematical convention), we need - // to add one more row for convenience (for now). 
This can either be done - // via symmetry (see the commented code below), or by the equations to fill - // from the left instead of the right - if (j % 2 == 1) { - int mb = j / 2; - // begin filling in the extra row - int jju_index = jju + (mb + 1) * (j + 1); - int jjup_index = jjup + mb * j; - - complex ui = complex::zero(); - - for (int ma = 0; ma < j; ma++) { - complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); - - real_type rootpq = rootpqarray(j - ma, mb + 1); - ui.re += rootpq * (b.re * ui_prev.re - b.im * ui_prev.im); - ui.im += rootpq * (b.re * ui_prev.im + b.im * ui_prev.re); - - ulist_cpu(iatom, jnbor, jju_index) = ui; - - rootpq = rootpqarray(ma + 1, mb + 1); - ui.re = rootpq * (a.re * ui_prev.re - a.im * ui_prev.im); - ui.im = rootpq * (a.re * ui_prev.im + a.im * ui_prev.re); - - jju_index++; - jjup_index++; - } - - ulist_cpu(iatom, jnbor, jju_index) = ui; - } - } - -} - /* ---------------------------------------------------------------------- compute derivatives of Wigner U-functions for one neighbor see comments in compute_uarray_cpu() @@ -1683,6 +1608,54 @@ void SNAKokkos::compute_duarray_cpu(int ia } } +/* ---------------------------------------------------------------------- + compute dEidRj, CPU path only. + dulist takes advantage of a `cached` data layout, similar to the + shared memory layout for the GPU routines, which is efficient for + compressing the calculation in compute_duarray_cpu. That said, + dulist only uses the "half" data layout part of that structure. 
+------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_deidrj_cpu(const int& iatom, const int& jnbor) const +{ + real_type force_sum[3] = { 0, 0, 0 }; + const int jelem = element(iatom, jnbor); + + for (int j = 0; j <= twojmax; j++) { + int jju_half = idxu_half_block[j]; + int jju_cache = idxu_cache_block[j]; + + for (int mb = 0; 2 * mb < j; mb++) { + for (int ma = 0; ma <= j; ma++) { + complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + for (int k = 0; k < 3; k++) + force_sum[k] += dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + + dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im; + jju_half++; jju_cache++; + } + } //end loop over ma mb + + // For j even, handle middle column + + if (j % 2 == 0) { + //int mb = j / 2; + for (int ma = 0; ma <= j; ma++) { + complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + for (int k = 0; k < 3; k++) + force_sum[k] += static_cast(0.5) * (dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + + dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im); + jju_half++; jju_cache++; + } + } // end if jeven + + } + + for (int k = 0; k < 3; k++) + dedr(iatom, jnbor, k) = 2 * force_sum[k]; +} + /* ---------------------------------------------------------------------- factorial n, wrapper for precomputed table ------------------------------------------------------------------------- */ From 99c4d116b8bd9e916fea1ad3ce32e9783547f0ad Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 09:48:20 -0800 Subject: [PATCH 15/30] Flatten compute_ui_cpu and compute_duidrj_cpu into single functions --- src/KOKKOS/sna_kokkos.h | 12 ---- src/KOKKOS/sna_kokkos_impl.h | 111 +++++++++++------------------------ 2 files changed, 35 insertions(+), 88 deletions(-) diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 4c9cace26a..46bacbf3d5 100644 --- a/src/KOKKOS/sna_kokkos.h +++ 
b/src/KOKKOS/sna_kokkos.h @@ -357,23 +357,11 @@ class SNAKokkos { inline void init_rootpqarray(); // init() - KOKKOS_INLINE_FUNCTION - void add_uarraytot(int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui - - KOKKOS_INLINE_FUNCTION - void compute_uarray_cpu(int, int, const real_type&, const real_type&, const real_type&, - const real_type&, const real_type&) const; // compute_ui_cpu - - inline double deltacg(int, int, int); // init_clebsch_gordan inline int compute_ncoeff(); // SNAKokkos() - KOKKOS_INLINE_FUNCTION - void compute_duarray_cpu(int, int, const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu - const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, - const real_type&, const real_type&) const; // Sets the style for the switching function // 0 = none diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 24af462cc5..fd5bec9650 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -662,47 +662,36 @@ void SNAKokkos::evaluate_ui_jbend(const Wi /* ---------------------------------------------------------------------- compute Ui by summing over bispectrum components. CPU only. - See comments above compute_uarray_cpu and add_uarraytot for - data layout comments. + This first computes Wigner U-functions for one neighbor into `ulist_cpu`. + `ulist_cpu` uses a "cached" data layout, matching the amount of + information stored between layers via scratch memory on the GPU path. + Next, it adds the Wigner U-functions for that neighbor to `ulisttot_re` + and `ulisttot_im`, which are in a "half" data layout that takes full + advantage of the symmetry of the Wigner U matrices.
------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_ui_cpu(const int& iatom, const int& jnbor) const { - real_type rsq, r, x, y, z, z0, theta0; - // utot(j,ma,mb) = 0 for all j,ma,ma // utot(j,ma,ma) = 1 for all j,ma // for j in neighbors of i: // compute r0 = (x,y,z,z0) // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); + const real_type x = rij(iatom,jnbor,0); + const real_type y = rij(iatom,jnbor,1); + const real_type z = rij(iatom,jnbor,2); + const real_type rsq = x * x + y * y + z * z; + const real_type r = sqrt(rsq); - theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + const real_type theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); // theta0 = (r - rmin0) * rscale0; - z0 = r / tan(theta0); + const real_type z0 = r / tan(theta0); - compute_uarray_cpu(iatom, jnbor, x, y, z, z0, r); - add_uarraytot(iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor)); -} + // begin what was "compute_uarray_cpu" -/* ---------------------------------------------------------------------- - compute Wigner U-functions for one neighbor. 
- `ulisttot` uses a "cached" data layout, matching the amount of - information stored between layers via scratch memory on the GPU path -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uarray_cpu(int iatom, int jnbor, - const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const -{ // compute Cayley-Klein parameters for unit quaternion real_type r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); complex a = { r0inv * z0, -r0inv * z }; @@ -774,23 +763,11 @@ void SNAKokkos::compute_uarray_cpu(int iat ulist_cpu(iatom, jnbor, jju_index) = ui; } } -} -/* ---------------------------------------------------------------------- - add Wigner U-functions for one neighbor to the total - ulist is in a "cached" data layout, which is a compressed layout - which still keeps the recursive calculation simple. On the other hand - `ulisttot` uses a "half" data layout, which fully takes advantage - of the symmetry of the Wigner U matrices. 
-------------------------------------------------------------------------- */ + // begin what was add_uarraytot -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::add_uarraytot(int iatom, int jnbor, - const real_type& r, const real_type& wj, const real_type& rcut, - const real_type& sinner, const real_type& dinner, int jelem) const -{ - const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj; + const real_type sfac = compute_sfac(r, rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)) * wj(iatom,jnbor); + const auto jelem = element(iatom, jnbor); for (int j = 0; j <= twojmax; j++) { int jju_half = idxu_half_block[j]; // index into ulisttot @@ -1418,42 +1395,24 @@ typename SNAKokkos::real_type SNAKokkos KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_duidrj_cpu(const int& iatom, const int& jnbor) const -{ - real_type rsq, r, x, y, z, z0, theta0, cs, sn; - real_type dz0dr; - - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - real_type rscale0 = rfac0 * static_cast(MY_PI) / (rcutij(iatom,jnbor) - rmin0); - theta0 = (r - rmin0) * rscale0; - sn = sin(theta0); - cs = cos(theta0); - z0 = r * cs / sn; - dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - - compute_duarray_cpu(iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)); -} - -/* ---------------------------------------------------------------------- - compute derivatives of Wigner U-functions for one neighbor - see comments in compute_uarray_cpu() - Uses same cached data layout of ulist -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duarray_cpu(int iatom, int jnbor, - const real_type& x, const real_type& y, const real_type& z, - const real_type& z0, const real_type& r, const real_type& dz0dr, - const real_type& wj, const real_type& rcut, - const real_type& 
sinner, const real_type& dinner) const { complex da[3], db[3]; real_type u[3], dz0[3], dr0inv[3]; + const real_type x = rij(iatom,jnbor,0); + const real_type y = rij(iatom,jnbor,1); + const real_type z = rij(iatom,jnbor,2); + const real_type rsq = x * x + y * y + z * z; + const real_type r = sqrt(rsq); + const real_type rscale0 = rfac0 * static_cast(MY_PI) / (rcutij(iatom,jnbor) - rmin0); + const real_type theta0 = (r - rmin0) * rscale0; + const real_type sn = sin(theta0); + const real_type cs = cos(theta0); + const real_type z0 = r * cs / sn; + const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; + + // begin what was compute_duarray_cpu + real_type rinv = 1.0 / r; u[0] = x * rinv; u[1] = y * rinv; @@ -1584,11 +1543,11 @@ void SNAKokkos::compute_duarray_cpu(int ia } } - real_type sfac = compute_sfac(r, rcut, sinner, dinner); - real_type dsfac = compute_dsfac(r, rcut, sinner, dinner); + real_type sfac, dsfac; + compute_s_dsfac(r, rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), sfac, dsfac); - sfac *= wj; - dsfac *= wj; + sfac *= wj(iatom,jnbor); + dsfac *= wj(iatom,jnbor); // Even though we fill out a full "cached" data layout above, // we only need the "half" data for the accumulation into dedr. 
From 8a65f442372dfc1ae028446518c106aa9319cc05 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 09:58:50 -0800 Subject: [PATCH 16/30] Unify the CPU and GPU TransformUi routines --- src/KOKKOS/pair_snap_kokkos.h | 3 +- src/KOKKOS/pair_snap_kokkos_impl.h | 81 ++++-------------------------- src/KOKKOS/sna_kokkos.h | 4 ++ src/KOKKOS/sna_kokkos_impl.h | 33 ++++++++++++ 4 files changed, 47 insertions(+), 74 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index e7797097c2..1257a3ad23 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -60,7 +60,6 @@ struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence struct TagPairSNAPComputeNeighCPU{}; struct TagPairSNAPPreUiCPU{}; struct TagPairSNAPComputeUiCPU{}; -struct TagPairSNAPTransformUiCPU{}; struct TagPairSNAPComputeZiCPU{}; struct TagPairSNAPComputeBiCPU{}; struct TagPairSNAPZeroYiCPU{}; @@ -210,7 +209,7 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeUiCPU, const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const; + void operator() (TagPairSNAPTransformUi, const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeZiCPU,const int& ii) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index d0647337a8..a740a8f75a 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -249,8 +249,9 @@ void PairSNAPKokkos::compute(int eflag_in, { // Expand ulisttot_re,_im -> ulisttot // Zero out ylist - typename Kokkos::MDRangePolicy, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size}); - Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this); + int idxu_max = snaKK.idxu_max; + typename Kokkos::RangePolicy 
policy_transform_ui_cpu(0, chunk_size * idxu_max); + Kokkos::parallel_for("TransformUi",policy_transform_ui_cpu,*this); } //Compute bispectrum @@ -888,83 +889,19 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const { - const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (idxu > snaKK.idxu_max) return; - - int elem_count = chemflag ? nelements : 1; - - for (int ielem = 0; ielem < elem_count; ielem++) { - - const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - - auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half); - auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half); - - if (mapper.flip_sign == 1) { - utot_im = -utot_im; - } else if (mapper.flip_sign == -1) { - utot_re = -utot_re; - } - - snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im }; - - if (mapper.flip_sign == 0) { - snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.; - snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.; - } - } + snaKK.transform_ui(iatom, idxu); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const { - +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int& ii) const { + const int iatom = ii / snaKK.idxu_max; + const int idxu = ii % snaKK.idxu_max; if (iatom >= chunk_size) return; - - if (j > twojmax) return; - - int elem_count = chemflag ? 
nelements : 1; - - // De-symmetrize ulisttot - for (int ielem = 0; ielem < elem_count; ielem++) { - - const int jju_half = snaKK.idxu_half_block(j); - const int jju = snaKK.idxu_block(j); - - for (int mb = 0; 2*mb <= j; mb++) { - for (int ma = 0; ma <= j; ma++) { - // Extract top half - - const int idxu_shift = mb * (j + 1) + ma; - const int idxu_half = jju_half + idxu_shift; - const int idxu = jju + idxu_shift; - - // Load ulist - complex utot = { snaKK.ulisttot_re(iatom, ielem, idxu_half), snaKK.ulisttot_im(iatom, ielem, idxu_half) }; - - // Store - snaKK.ulisttot(iatom, ielem, idxu) = utot; - - // Zero Yi - snaKK.ylist_re(iatom, ielem, idxu_half) = 0; - snaKK.ylist_im(iatom, ielem, idxu_half) = 0; - - // Symmetric term - const int sign_factor = (((ma+mb)%2==0)?1:-1); - const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1); - - if (sign_factor == 1) { - utot.im = -utot.im; - } else { - utot.re = -utot.re; - } - - snaKK.ulisttot(iatom, ielem, idxu_flip) = utot; - } - } - } + snaKK.transform_ui(iatom, idxu); } /* ---------------------------------------------------------------------- @@ -986,7 +923,7 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU, const int& ii) const { snaKK.compute_zi_cpu(ii); } diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 46bacbf3d5..c3ec81cc76 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -198,6 +198,10 @@ class SNAKokkos { KOKKOS_INLINE_FUNCTION void compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int) const; // ForceSNAP + // desymmetrize ulisttot + KOKKOS_INLINE_FUNCTION + void transform_ui(const int&, const int&) const; + KOKKOS_INLINE_FUNCTION void compute_zi(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/sna_kokkos_impl.h 
b/src/KOKKOS/sna_kokkos_impl.h index fd5bec9650..4ae4f61d9e 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -784,6 +784,39 @@ void SNAKokkos::compute_ui_cpu(const int& } } +/* ---------------------------------------------------------------------- + De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot + structure, fused in with zeroing ylist +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::transform_ui(const int& iatom, const int& idxu) const +{ + int elem_count = chem_flag ? nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++) { + + const FullHalfMapper mapper = idxu_full_half[idxu]; + + auto utot_re = ulisttot_re(iatom, ielem, mapper.idxu_half); + auto utot_im = ulisttot_im(iatom, ielem, mapper.idxu_half); + + if (mapper.flip_sign == 1) { + utot_im = -utot_im; + } else if (mapper.flip_sign == -1) { + utot_re = -utot_re; + } + + ulisttot(iatom, ielem, idxu) = { utot_re, utot_im }; + + if (mapper.flip_sign == 0) { + ylist_re(iatom, ielem, mapper.idxu_half) = 0.; + ylist_im(iatom, ielem, mapper.idxu_half) = 0.; + } + } +} + /* ---------------------------------------------------------------------- compute Zi by summing over products of Ui ------------------------------------------------------------------------- */ From 98b67b8ea008978293738a48175c274c0cfbe9ea Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 10:13:11 -0800 Subject: [PATCH 17/30] Unify the CPU and GPU ComputeYi and ComputeZi routines; extend ComputeYiWithZlist to the CPU --- src/KOKKOS/pair_snap_kokkos.h | 18 +++--- src/KOKKOS/pair_snap_kokkos_impl.h | 52 ++++++++++------ src/KOKKOS/sna_kokkos.h | 4 -- src/KOKKOS/sna_kokkos_impl.h | 97 ------------------------------ 4 files changed, 42 insertions(+), 129 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 1257a3ad23..35fc551d66 100644 --- 
a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -36,7 +36,12 @@ PairStyle(snap/kk/host,PairSNAPKokkosDevice); namespace LAMMPS_NS { // Routines for both the CPU and GPU backend +struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +struct TagPairSNAPComputeZi{}; +struct TagPairSNAPComputeYi{}; +struct TagPairSNAPComputeYiWithZlist{}; struct TagPairSNAPBeta{}; + template struct TagPairSNAPComputeForce{}; @@ -46,11 +51,7 @@ struct TagPairSNAPComputeCayleyKlein{}; struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence -struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist -struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPComputeYi{}; -struct TagPairSNAPComputeYiWithZlist{}; template struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence template @@ -60,9 +61,7 @@ struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence struct TagPairSNAPComputeNeighCPU{}; struct TagPairSNAPPreUiCPU{}; struct TagPairSNAPComputeUiCPU{}; -struct TagPairSNAPComputeZiCPU{}; struct TagPairSNAPComputeBiCPU{}; -struct TagPairSNAPZeroYiCPU{}; struct TagPairSNAPComputeYiCPU{}; struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDeidrjCPU{}; @@ -212,13 +211,16 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPTransformUi, const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZiCPU,const int& ii) const; + void operator() (TagPairSNAPComputeZi, const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBiCPU, const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYiCPU,const int& ii) const; + void operator() (TagPairSNAPComputeYi, const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() 
(TagPairSNAPComputeYiWithZlist, const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index a740a8f75a..20f8ec09d6 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -251,32 +251,36 @@ void PairSNAPKokkos::compute(int eflag_in, // Zero out ylist int idxu_max = snaKK.idxu_max; typename Kokkos::RangePolicy policy_transform_ui_cpu(0, chunk_size * idxu_max); - Kokkos::parallel_for("TransformUi",policy_transform_ui_cpu,*this); + Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this); } //Compute bispectrum if (quadraticflag || eflag) { //ComputeZi int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_zi_cpu(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); + typename Kokkos::RangePolicy policy_zi_cpu(0, chunk_size * idxz_max); + Kokkos::parallel_for("ComputeZiCPU", policy_zi_cpu, *this); //ComputeBi int idxb_max = snaKK.idxb_max; typename Kokkos::RangePolicy policy_bi_cpu(0, chunk_size * idxb_max); Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); + + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBeta",policy_beta,*this); } //ComputeYi { - //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); - Kokkos::parallel_for("ComputeBeta",policy_beta,*this); - - //ComputeYi int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_yi_cpu(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this); + if (quadraticflag || eflag) { + typename Kokkos::RangePolicy policy_yi_cpu(0, chunk_size * idxz_max); + Kokkos::parallel_for("ComputeYiWithZlistCPU", policy_yi_cpu,*this); + } else { + typename Kokkos::RangePolicy policy_yi_cpu(0, chunk_size * idxz_max); + 
Kokkos::parallel_for("ComputeYiCPU", policy_yi_cpu,*this); + } } // host flag //ComputeDuidrj and Deidrj @@ -912,19 +916,19 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { - +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int iatom_mod, const int jjz, const int iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjz >= snaKK.idxz_max) return; - snaKK.compute_zi(iatom, jjz); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU, const int& ii) const { - snaKK.compute_zi_cpu(ii); +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& ii) const { + const int iatom = ii / snaKK.idxz_max; + const int jjz = ii % snaKK.idxz_max; + snaKK.compute_zi(iatom, jjz); } /* ---------------------------------------------------------------------- @@ -1004,19 +1008,19 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { - +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int iatom_mod, const int jjz, const int iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjz >= snaKK.idxz_max) return; - snaKK.compute_yi(iatom, jjz); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU, const int& ii) const { - snaKK.compute_yi_cpu(ii); +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& ii) const { + const int iatom = ii / snaKK.idxz_max; + const int jjz = ii % snaKK.idxz_max; + snaKK.compute_yi(iatom, jjz); } /* ---------------------------------------------------------------------- @@ -1035,6 +1039,14 @@ void PairSNAPKokkos::operator() (TagPairSN 
snaKK.compute_yi_with_zlist(iatom, jjz); } +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& ii) const { + const int iatom = ii / snaKK.idxz_max; + const int jjz = ii % snaKK.idxz_max; + snaKK.compute_yi_with_zlist(iatom, jjz); +} + /* ---------------------------------------------------------------------- Assemble the force contributions for each atom, neighbor pair by contracting the adjoint matrices Y with derivatives of the Wigner diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index c3ec81cc76..6e6cfd37d3 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -244,10 +244,6 @@ class SNAKokkos { void pre_ui_cpu(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_ui_cpu(const int&, const int&) const; // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_zi_cpu(const int&) const; // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_yi_cpu(int) const; // ForceSNAP // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 4ae4f61d9e..546439e575 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -828,30 +828,6 @@ void SNAKokkos::compute_zi(const int& iato int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg; idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg); - const real_type* cgblock = cglist.data() + idxcg; - - int idouble = 0; - - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - - zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); - - idouble++; - } - } -} - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi_cpu(const int& iter) const -{ - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; - - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, 
idxcg; - idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg); - const real_type *cgblock = cglist.data() + idxcg; int idouble = 0; @@ -1048,79 +1024,6 @@ void SNAKokkos::compute_yi(const int& iato } // end loop over elem1 } -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi_cpu(int iter) const -{ - real_type betaj; - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; - - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; - idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg); - - const real_type *cgblock = cglist.data() + idxcg; - //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; - //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; - - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - - real_type ztmp_r = 0.0; - real_type ztmp_i = 0.0; - - int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; - int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 +1) + mb2max; - - for (int ib = 0; ib < nb; ib++) { - - real_type suma1_r = 0.0; - real_type suma1_i = 0.0; - - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min*(j2+1) + ma2max; - - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re - - ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im); - suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im + - ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia - - ztmp_r += cgblock[icgb] * suma1_r; - ztmp_i += cgblock[icgb] * suma1_i; - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib - - if (bnorm_flag) { - const real_type scale = static_cast(1) / static_cast(j + 1); - ztmp_i *= scale; - ztmp_r *= scale; - } - - // apply to z(j1,j2,j,ma,mb) to 
unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); - - Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj*ztmp_r); - Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj*ztmp_i); - } // end loop over elem3 - } // end loop over elem2 - } // end loop over elem1 -} - /* ---------------------------------------------------------------------- compute Yi from Ui with the precomputed Zi. ------------------------------------------------------------------------- */ From 6e54d9326b16f89f834603ed63161bdd021f17c1 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 10:20:42 -0800 Subject: [PATCH 18/30] Unify the CPU and GPU PreUi routines --- src/KOKKOS/pair_snap_kokkos.h | 14 +++++--------- src/KOKKOS/pair_snap_kokkos_impl.h | 15 +++++++++------ src/KOKKOS/sna_kokkos.h | 2 -- src/KOKKOS/sna_kokkos_impl.h | 24 ------------------------ 4 files changed, 14 insertions(+), 41 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 35fc551d66..9175c033fc 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -36,22 +36,21 @@ PairStyle(snap/kk/host,PairSNAPKokkosDevice); namespace LAMMPS_NS { // Routines for both the CPU and GPU backend +struct TagPairSNAPPreUi{}; struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; +struct TagPairSNAPComputeBi{}; +struct TagPairSNAPBeta{}; struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYiWithZlist{}; -struct TagPairSNAPBeta{}; - template struct TagPairSNAPComputeForce{}; // GPU backend only struct TagPairSNAPComputeNeigh{}; struct TagPairSNAPComputeCayleyKlein{}; -struct TagPairSNAPPreUi{}; struct 
TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence -struct TagPairSNAPComputeBi{}; template struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence template @@ -59,10 +58,7 @@ struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence // CPU backend only struct TagPairSNAPComputeNeighCPU{}; -struct TagPairSNAPPreUiCPU{}; struct TagPairSNAPComputeUiCPU{}; -struct TagPairSNAPComputeBiCPU{}; -struct TagPairSNAPComputeYiCPU{}; struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDeidrjCPU{}; @@ -202,7 +198,7 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUiCPU, const int& iatom) const; + void operator() (TagPairSNAPPreUi, const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUiCPU, const int& ii) const; @@ -214,7 +210,7 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeZi, const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBiCPU, const int& ii) const; + void operator() (TagPairSNAPComputeBi, const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYi, const int& ii) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 20f8ec09d6..8349f8b398 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -235,7 +235,7 @@ void PairSNAPKokkos::compute(int eflag_in, //PreUi { - typename Kokkos::RangePolicy policy_preui_cpu(0, chunk_size); + typename Kokkos::RangePolicy policy_preui_cpu(0, chunk_size * (twojmax + 1)); Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); } @@ -263,7 +263,7 @@ void PairSNAPKokkos::compute(int eflag_in, //ComputeBi int idxb_max = snaKK.idxb_max; - typename 
Kokkos::RangePolicy policy_bi_cpu(0, chunk_size * idxb_max); + typename Kokkos::RangePolicy policy_bi_cpu(0, chunk_size * idxb_max); Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); //Compute beta = dE_i/dB_i for all i in list @@ -807,11 +807,14 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU, const int& iatom) const { +void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int& ii) const { + const int iatom = ii / (twojmax+1); + const int j = ii % (twojmax+1); + const int itype = type(iatom); const int ielem = d_map[itype]; - snaKK.pre_ui_cpu(iatom, ielem); + snaKK.pre_ui(iatom, j, ielem); } /* ---------------------------------------------------------------------- @@ -944,12 +947,12 @@ void PairSNAPKokkos::operator() (TagPairSN if (iatom >= chunk_size) return; if (jjb >= snaKK.idxb_max) return; - snaKK.compute_bi(iatom,jjb); + snaKK.compute_bi(iatom, jjb); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU, const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& ii) const { const int iatom = ii / snaKK.idxb_max; const int jjb = ii % snaKK.idxb_max; snaKK.compute_bi(iatom, jjb); diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 6e6cfd37d3..92f413ed17 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -241,8 +241,6 @@ class SNAKokkos { // functions for bispectrum coefficients, CPU only KOKKOS_INLINE_FUNCTION - void pre_ui_cpu(const int&, const int&) const; // ForceSNAP - KOKKOS_INLINE_FUNCTION void compute_ui_cpu(const int&, const int&) const; // ForceSNAP // functions for derivatives, CPU only diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 546439e575..4cdd37d1f5 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -466,30 +466,6 @@ void SNAKokkos::pre_ui(const int& iatom, c } } -template 
-KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui_cpu(const int& iatom, const int& ielem) const -{ - for (int jelem = 0; jelem < nelements; jelem++) { - for (int j = 0; j <= twojmax; j++) { - int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug - - // Only diagonal elements get initialized - for (int m = 0; m < (j+1)*(j/2+1); m++) { - const int jjup = jju + m; - - // if m is on the "diagonal", initialize it with the self energy. - // Otherwise zero it out - real_type init = 0; - if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = wself; } //need to map iatom to element - - ulisttot_re(iatom, jelem, jjup) = init; - ulisttot_im(iatom, jelem, jjup) = 0; - }; - } - } -} - /* ---------------------------------------------------------------------- compute Ui by computing Wigner U-functions for one neighbor and accumulating to the total. GPU only. From 5dcadf57484b3011ba61680dbf7f717bb3bfaabf Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 10:29:02 -0800 Subject: [PATCH 19/30] Created a simplifying abstraction to separately call host or device kernels where the code is identical --- src/KOKKOS/pair_snap_kokkos.h | 11 + src/KOKKOS/pair_snap_kokkos_impl.h | 354 ++++++++++++----------------- 2 files changed, 162 insertions(+), 203 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 9175c033fc..ece1384a49 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -127,6 +127,17 @@ class PairSNAPKokkos : public PairSNAP { template using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagPairSNAP>; + // Helper routine that returns a CPU or a GPU policy as appropriate + template + auto snap_get_policy(const int& chunk_size_div, const int& second_loop) { + if constexpr (host_flag) + return typename Kokkos::RangePolicy(0, chunk_size_div * vector_length * second_loop); + else + return Snap3DRangePolicy({0, 0, 0}, + {vector_length, second_loop, 
chunk_size_div}, + {vector_length, num_teams, 1}); + } + PairSNAPKokkos(class LAMMPS *); ~PairSNAPKokkos() override; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 8349f8b398..c365d03c90 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -221,236 +221,173 @@ void PairSNAPKokkos::compute(int eflag_in, if (chunk_size > inum - chunk_offset) chunk_size = inum - chunk_offset; - if constexpr (host_flag) + // pre-compute ceil(chunk_size / vector_length) and the padded chunk size for convenience + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + const int chunk_size_pad = chunk_size_div * vector_length; + + // ComputeNeigh + if constexpr (host_flag) { + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size); + typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this); + } else { + // team_size_compute_neigh is defined in `pair_snap_kokkos.h` + int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); + + SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + // ComputeCayleyKlein; this is only called on the GPU + if constexpr (!host_flag) { + // tile_size_compute_ck is defined in `pair_snap_kokkos.h` + Snap3DRangePolicy + policy_compute_ck({0,0,0},{vector_length,max_neighs,chunk_size_div},{vector_length,tile_size_compute_ck,1}); + Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); + } + + // PreUi; same CPU and GPU codepath { - // Host codepath - - //ComputeNeigh - { - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); - 
Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this); - } - - //PreUi - { - typename Kokkos::RangePolicy policy_preui_cpu(0, chunk_size * (twojmax + 1)); - Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); - } - - // ComputeUi - { - // Fused calculation of ulist and accumulation into ulisttot using atomics - typename Kokkos::RangePolicy policy_ui_cpu(0, chunk_size*max_neighs); - Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); - } - - { - // Expand ulisttot_re,_im -> ulisttot - // Zero out ylist - int idxu_max = snaKK.idxu_max; - typename Kokkos::RangePolicy policy_transform_ui_cpu(0, chunk_size * idxu_max); - Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this); - } - - //Compute bispectrum - if (quadraticflag || eflag) { - //ComputeZi - int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_zi_cpu(0, chunk_size * idxz_max); - Kokkos::parallel_for("ComputeZiCPU", policy_zi_cpu, *this); - - //ComputeBi - int idxb_max = snaKK.idxb_max; - typename Kokkos::RangePolicy policy_bi_cpu(0, chunk_size * idxb_max); - Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); - - //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); - Kokkos::parallel_for("ComputeBeta",policy_beta,*this); - } - - //ComputeYi - { - int idxz_max = snaKK.idxz_max; - if (quadraticflag || eflag) { - typename Kokkos::RangePolicy policy_yi_cpu(0, chunk_size * idxz_max); - Kokkos::parallel_for("ComputeYiWithZlistCPU", policy_yi_cpu,*this); - } else { - typename Kokkos::RangePolicy policy_yi_cpu(0, chunk_size * idxz_max); - Kokkos::parallel_for("ComputeYiCPU", policy_yi_cpu,*this); - } - } // host flag - - //ComputeDuidrj and Deidrj - { - typename Kokkos::RangePolicy policy_duidrj_cpu(0,chunk_size*max_neighs); - Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - - typename Kokkos::RangePolicy policy_deidrj_cpu(0,chunk_size*max_neighs); - 
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); - } - - } else { // GPU - - // The compiler will only hit this codepath if this is the GPU code - - // Pre-compute ceil(chunk_size / vector_length) for code cleanliness - const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; - - //ComputeNeigh - { - // team_size_compute_neigh is defined in `pair_snap_kokkos.h` - int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); - - SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); - policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); - } - - //ComputeCayleyKlein - { - // tile_size_compute_ck is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy - policy_compute_ck({0,0,0},{vector_length,max_neighs,chunk_size_div},{vector_length,tile_size_compute_ck,1}); - Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); - } - - //PreUi - { - // tile_size_pre_ui is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy - policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); - Kokkos::parallel_for("PreUi",policy_preui,*this); - } + auto policy_pre_ui = snap_get_policy(chunk_size_div, twojmax + 1); + //typename Kokkos::RangePolicy policy_preui_cpu(0, chunk_size * (twojmax + 1)); + Kokkos::parallel_for("PreUi",policy_pre_ui,*this); + } + // ComputeUi; separate CPU, GPU codepaths + if constexpr (host_flag) { + // Fused calculation of ulist and accumulation into ulisttot using atomics + typename Kokkos::RangePolicy policy_ui_cpu(0, chunk_size*max_neighs); + Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); + } else { // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot + + // team_size_compute_ui is defined in `pair_snap_kokkos.h` + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size 
= vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + + if (chunk_size < parallel_thresh) { - // team_size_compute_ui is defined in `pair_snap_kokkos.h` - // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer - const int tile_size = vector_length * (twojmax + 1); - const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + // Version with parallelism over j_bend - if (chunk_size < parallel_thresh) - { - // Version with parallelism over j_bend + // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) - const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); - const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this); + } else { + // Version w/out parallelism over j_bend - SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); - policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this); - } else { - // Version w/out parallelism over j_bend + // total number of teams needed: (natoms / 32) * (max_neighs) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - // total number of teams needed: (natoms / 32) * (max_neighs) - const int n_teams = chunk_size_div * max_neighs; - const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - - SnapAoSoATeamPolicy 
policy_ui(n_teams_div, team_size_compute_ui, vector_length); - policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this); - } + SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this); } + } - //TransformUi: un-"fold" ulisttot, zero ylist - { - // team_size_transform_ui is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy - policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); - Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); - } + { + // Expand ulisttot_re,_im -> ulisttot + // Zero out ylist + auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); + //int idxu_max = snaKK.idxu_max; + //typename Kokkos::RangePolicy policy_transform_ui_cpu(0, chunk_size * idxu_max); + Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + } - //Compute bispectrum in AoSoA data layout, transform Bi - if (quadraticflag || eflag) { - // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` + //Compute bispectrum + if (quadraticflag || eflag) { + // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` - //ComputeZi - const int idxz_max = snaKK.idxz_max; - Snap3DRangePolicy - policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); - Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + //ComputeZi + auto policy_compute_zi = snap_get_policy(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); - //ComputeBi - const int idxb_max = snaKK.idxb_max; - Snap3DRangePolicy - policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); - 
Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); - } + //ComputeBi + auto policy_compute_bi = snap_get_policy(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + } + + { + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0, chunk_size_pad); + Kokkos::parallel_for("ComputeBeta", policy_beta, *this); //Note zeroing `ylist` is fused into `TransformUi`. - { - //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); - Kokkos::parallel_for("ComputeBeta",policy_beta,*this); - const int idxz_max = snaKK.idxz_max; - if (quadraticflag || eflag) { - Snap3DRangePolicy - policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1}); - Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this); - } else { - Snap3DRangePolicy - policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1}); - Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); - } + if (quadraticflag || eflag) { + auto policy_compute_yi = snap_get_policy(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this); + } else { + auto policy_compute_yi = snap_get_policy(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); } + } + if constexpr (host_flag) { + //ComputeDuidrj and Deidrj + typename Kokkos::RangePolicy policy_duidrj_cpu(0, chunk_size*max_neighs); + Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); + + typename Kokkos::RangePolicy policy_deidrj_cpu(0, chunk_size*max_neighs); + Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); + } else { // GPU // Fused ComputeDuidrj, ComputeDeidrj + // team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h` + + // scratch size: 32 atoms * (twojmax+1) cached values * 2 for u, du, no double buffer + const int 
tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(2 * team_size_compute_fused_deidrj * tile_size); + + if (chunk_size < parallel_thresh) { - // team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h` + // Version with parallelism over j_bend - // scratch size: 32 atoms * (twojmax+1) cached values * 2 for u, du, no double buffer - const int tile_size = vector_length * (twojmax + 1); - const int scratch_size = scratch_size_helper(2 * team_size_compute_fused_deidrj * tile_size); + // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; - if (chunk_size < parallel_thresh) - { - // Version with parallelism over j_bend + // x direction + SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this); - // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) - const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); - const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; + // y direction + SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this); - // x direction - SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - 
Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this); + // z direction + SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this); + } else { + // Version w/out parallelism over j_bend - // y direction - SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this); + // total number of teams needed: (natoms / 32) * (max_neighs) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; - // z direction - SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this); - } else { - // Version w/out parallelism over j_bend + // x direction + SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this); - // total number of teams needed: (natoms / 32) * (max_neighs) - const int n_teams = chunk_size_div * max_neighs; - const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; + // y direction + SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_y = 
policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this); - // x direction - SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this); + // z direction + SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this); - // y direction - SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this); - - // z direction - SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this); - - } } } @@ -700,6 +637,8 @@ KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const { int ii = team.league_rank(); + if (ii >= chunk_size) return; + const int i = d_ilist[ii + chunk_offset]; const double xtmp = x(i,0); const double ytmp = x(i,1); @@ -809,8 +748,9 @@ template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int& ii) const { const int iatom = ii / (twojmax+1); - const int j = ii % (twojmax+1); + if (iatom >= chunk_size) return; + const int j = ii % (twojmax+1); const int itype = 
type(iatom); const int ielem = d_map[itype]; @@ -882,6 +822,8 @@ KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU, const int& ii) const { const int iatom = ii / max_neighs; const int jnbor = ii % max_neighs; + + if (iatom >= chunk_size) return; const int ninside = d_ninside(iatom); if (jnbor >= ninside) return; @@ -931,6 +873,7 @@ KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& ii) const { const int iatom = ii / snaKK.idxz_max; const int jjz = ii % snaKK.idxz_max; + if (iatom >= chunk_size) return; snaKK.compute_zi(iatom, jjz); } @@ -955,6 +898,7 @@ KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& ii) const { const int iatom = ii / snaKK.idxb_max; const int jjb = ii % snaKK.idxb_max; + if (iatom >= chunk_size) return; snaKK.compute_bi(iatom, jjb); } @@ -1023,6 +967,7 @@ KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& ii) const { const int iatom = ii / snaKK.idxz_max; const int jjz = ii % snaKK.idxz_max; + if (iatom >= chunk_size) return; snaKK.compute_yi(iatom, jjz); } @@ -1047,6 +992,7 @@ KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& ii) const { const int iatom = ii / snaKK.idxz_max; const int jjz = ii % snaKK.idxz_max; + if (iatom >= chunk_size) return; snaKK.compute_yi_with_zlist(iatom, jjz); } @@ -1120,6 +1066,7 @@ KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const { const int iatom = ii / max_neighs; const int jnbor = ii % max_neighs; + if (iatom >= chunk_size) return; const int ninside = d_ninside(iatom); if (jnbor >= ninside) return; @@ -1137,6 +1084,7 @@ KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const { const int iatom = ii / max_neighs; const int jnbor = ii % max_neighs; + if (iatom >= chunk_size) return; const int ninside = d_ninside(iatom); 
if (jnbor >= ninside) return; From e8e2a0b01225711a1dda59132008f74ee047cfe2 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 10:36:01 -0800 Subject: [PATCH 20/30] Simplified the constructor for SNAKokkos by passing PairSNAPKokkos in directly by reference --- src/KOKKOS/pair_snap_kokkos.h | 13 +++++++++++-- src/KOKKOS/pair_snap_kokkos_impl.h | 8 ++++---- src/KOKKOS/sna_kokkos.h | 12 +++++++++++- src/KOKKOS/sna_kokkos_impl.h | 21 ++++++--------------- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index ece1384a49..611c79b56d 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -30,9 +30,15 @@ PairStyle(snap/kk/host,PairSNAPKokkosDevice); #include "pair_snap.h" #include "kokkos_type.h" #include "neigh_list_kokkos.h" -#include "sna_kokkos.h" #include "pair_kokkos.h" +namespace LAMMPS_NS { +// pre-declare so sna_kokkos.h can refer to it +template class PairSNAPKokkos; +}; + +#include "sna_kokkos.h" + namespace LAMMPS_NS { // Routines for both the CPU and GPU backend @@ -262,7 +268,7 @@ class PairSNAPKokkos : public PairSNAP { Kokkos::View d_radelem; // element radii Kokkos::View d_wjelem; // elements weights - Kokkos::View d_coeffelem; // element bispectrum coefficients + typename SNAKokkos::t_sna_2d_lr d_coeffelem; // element bispectrum coefficients Kokkos::View d_sinnerelem; // element inner cutoff midpoint Kokkos::View d_dinnerelem; // element inner cutoff half-width Kokkos::View d_map; // mapping from atom types to elements @@ -302,6 +308,9 @@ class PairSNAPKokkos : public PairSNAP { template int scratch_size_helper(int values_per_team); + // Make SNAKokkos a friend + friend class SNAKokkos; + }; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index c365d03c90..721ce5352d 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -245,14 +245,14 @@ void 
PairSNAPKokkos::compute(int eflag_in, // tile_size_compute_ck is defined in `pair_snap_kokkos.h` Snap3DRangePolicy policy_compute_ck({0,0,0},{vector_length,max_neighs,chunk_size_div},{vector_length,tile_size_compute_ck,1}); - Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); + Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); } // PreUi; same CPU and GPU codepath { auto policy_pre_ui = snap_get_policy(chunk_size_div, twojmax + 1); //typename Kokkos::RangePolicy policy_preui_cpu(0, chunk_size * (twojmax + 1)); - Kokkos::parallel_for("PreUi",policy_pre_ui,*this); + Kokkos::parallel_for("PreUi", policy_pre_ui, *this); } // ComputeUi; separate CPU, GPU codepaths @@ -531,8 +531,8 @@ void PairSNAPKokkos::coeff(int narg, char Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); Kokkos::deep_copy(d_map,h_map); - snaKK = SNAKokkos(rfac0,twojmax, - rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK = SNAKokkos(*this); //rfac0,twojmax, + //rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); snaKK.grow_rij(0,0); snaKK.init(); } diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 92f413ed17..24dfa1f4ac 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -143,6 +143,7 @@ class SNAKokkos { typedef Kokkos::View t_sna_2i; typedef Kokkos::View t_sna_2d; typedef Kokkos::View t_sna_2d_ll; + typedef Kokkos::View t_sna_2d_lr; typedef Kokkos::View t_sna_3d; typedef Kokkos::View t_sna_3d_ll; typedef Kokkos::View t_sna_4d; @@ -170,7 +171,8 @@ class SNAKokkos { SNAKokkos(const SNAKokkos& sna, const typename Kokkos::TeamPolicy::member_type& team); inline - SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int); + //SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int); + SNAKokkos(const PairSNAPKokkos&); KOKKOS_INLINE_FUNCTION ~SNAKokkos(); @@ -282,7 +284,12 @@ class SNAKokkos { int twojmax, diagonalstyle; + // 
Input beta coefficients; aliases the object in PairSnapKokkos + t_sna_2d_lr d_coeffelem; + // Beta for all atoms in list; aliases the object in PairSnapKokkos + // for qSNAP the quadratic terms get accumulated into it + // in compute_bi t_sna_2d d_beta; // Structures for both the CPU, GPU backend @@ -379,6 +386,9 @@ class SNAKokkos { real_type wself; int wselfall_flag; + // quadratic flag + int quadratic_flag; + int bzero_flag; // 1 if bzero subtracted from barray Kokkos::View bzero; // array of B values for isolated atoms }; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 4cdd37d1f5..90232d1333 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -30,27 +30,18 @@ static const double MY_PI2 = 1.57079632679489661923; // pi/2 template inline -SNAKokkos::SNAKokkos(real_type rfac0_in, - int twojmax_in, real_type rmin0_in, int switch_flag_in, int bzero_flag_in, - int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, int nelements_in, int switch_inner_flag_in) +SNAKokkos::SNAKokkos(const PairSNAPKokkos& psk) + : rfac0(psk.rfac0), rmin0(psk.rmin0), switch_flag(psk.switchflag), + bzero_flag(psk.bzeroflag), chem_flag(psk.chemflag), bnorm_flag(psk.bnormflag), + wselfall_flag(psk.wselfallflag), switch_inner_flag(psk.switchinnerflag), + quadratic_flag(psk.quadraticflag), twojmax(psk.twojmax), d_coeffelem(psk.d_coeffelem) { wself = static_cast(1.0); - rfac0 = rfac0_in; - rmin0 = rmin0_in; - switch_flag = switch_flag_in; - switch_inner_flag = switch_inner_flag_in; - bzero_flag = bzero_flag_in; - - chem_flag = chem_flag_in; if (chem_flag) - nelements = nelements_in; + nelements = psk.nelements; else nelements = 1; - bnorm_flag = bnorm_flag_in; - wselfall_flag = wselfall_flag_in; - - twojmax = twojmax_in; ncoeff = compute_ncoeff(); From 2d65365a67149ed26cbc8618e0bbea7e2259ed88 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 10:42:43 -0800 Subject: [PATCH 21/30] Added additional parallelism to 
TagPairSNAPBeta --- src/KOKKOS/pair_snap_kokkos.h | 27 ++++++++--- src/KOKKOS/pair_snap_kokkos_impl.h | 77 +++++++++++++++++++----------- src/KOKKOS/sna_kokkos.h | 2 + src/KOKKOS/sna_kokkos_impl.h | 56 ++++++++++++++++++++++ 4 files changed, 127 insertions(+), 35 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 611c79b56d..0f1d1c8aee 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -46,7 +46,8 @@ struct TagPairSNAPPreUi{}; struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPBeta{}; +struct TagPairSNAPZeroBeta{}; +struct TagPairSNAPComputeBeta{}; struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYiWithZlist{}; template @@ -93,7 +94,8 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 2; static constexpr int tile_size_compute_zi = 2; static constexpr int tile_size_compute_bi = 2; - static constexpr int tile_size_transform_bi = 2; + static constexpr int tile_size_zero_beta = 2; + static constexpr int tile_size_compute_beta = 2; static constexpr int tile_size_compute_yi = 2; static constexpr int team_size_compute_fused_deidrj = 2; #elif defined(KOKKOS_ENABLE_SYCL) @@ -104,7 +106,8 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 8; static constexpr int tile_size_compute_zi = 4; static constexpr int tile_size_compute_bi = 4; - static constexpr int tile_size_transform_bi = 4; + static constexpr int tile_size_zero_beta = 8; + static constexpr int tile_size_compute_beta = 8; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = 4; #else @@ -115,7 +118,8 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 4; static constexpr int tile_size_compute_zi = 8; static constexpr int tile_size_compute_bi = 4; - static constexpr 
int tile_size_transform_bi = 4; + static constexpr int tile_size_zero_beta = 4; + static constexpr int tile_size_compute_beta = 4; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2; #endif @@ -160,9 +164,6 @@ class PairSNAPKokkos : public PairSNAP { void check_team_size_reduce(int, int&); // CPU and GPU backend - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPBeta, const int& ii) const; - template KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeForce,const int& ii) const; @@ -196,6 +197,12 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPZeroBeta,const int iatom_mod, const int idxb, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBeta, const int iatom_mod, const int idxb, const int iatom_div) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const; @@ -229,6 +236,12 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBi, const int& ii) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPZeroBeta, const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBeta, const int& ii) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYi, const int& ii) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 721ce5352d..b758109a51 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -315,9 +315,13 @@ void PairSNAPKokkos::compute(int eflag_in, } { + // Zero beta out + auto policy_zero_beta = snap_get_policy(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ZeroBeta",policy_zero_beta,*this); + 
//Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0, chunk_size_pad); - Kokkos::parallel_for("ComputeBeta", policy_beta, *this); + auto policy_compute_beta = snap_get_policy(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBeta", policy_compute_beta, *this); //Note zeroing `ylist` is fused into `TransformUi`. if (quadraticflag || eflag) { @@ -902,6 +906,32 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.compute_bi(iatom, jjb); } +/* ---------------------------------------------------------------------- + Zero out beta in advance of accumulating. CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta,const int iatom_mod, const int jjb, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjb >= snaKK.idxb_max) return; + + for (int itriple = 0; itriple < snaKK.ntriples; itriple++) + snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& ii) const { + const int iatom = ii / snaKK.idxb_max; + const int jjb = ii % snaKK.idxb_max; + if (iatom >= chunk_size) return; + + for (int itriple = 0; itriple < snaKK.ntriples; itriple++) + snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; +} /* ---------------------------------------------------------------------- Assemble the "beta" coefficients that enter the computation of the @@ -913,39 +943,30 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBeta, const int& iatom) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int iatom_mod, const int idxb, const int iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; 
+ if (idxb >= snaKK.idxb_max) return; + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + snaKK.compute_beta(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& ii) const { + const int iatom = ii / snaKK.idxb_max; + const int idxb = ii % snaKK.idxb_max; if (iatom >= chunk_size) return; const int i = d_ilist[iatom + chunk_offset]; const int itype = type[i]; const int ielem = d_map[itype]; - auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; - } - - if (quadraticflag) { - const auto idxb_max = snaKK.idxb_max; - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - const auto idxb = icoeff % idxb_max; - const auto idx_chem = icoeff / idxb_max; - real_type bveci = snaKK.blist(iatom, idx_chem, idxb); - d_beta(iatom, icoeff) += d_coeffi[k] * bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - const auto jdxb = jcoeff % idxb_max; - const auto jdx_chem = jcoeff / idxb_max; - real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb); - d_beta(iatom, icoeff) += d_coeffi[k] * bvecj; - d_beta(iatom, jcoeff) += d_coeffi[k] * bveci; - k++; - } - } - } + snaKK.compute_beta(iatom, idxb, ielem); } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 24dfa1f4ac..cd624238a1 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -212,6 +212,8 @@ class SNAKokkos { void compute_yi_with_zlist(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi(const int&, const int&) const; // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_beta(const int&, const int&, const int&) const; // functions for derivatives, GPU only // version of the code with parallelism over j_bend diff 
--git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 90232d1333..983f7a0d27 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -953,6 +953,62 @@ void SNAKokkos::compute_bi(const int& iato } // end loop over elem1 } +/* ---------------------------------------------------------------------- + compute beta by either appropriately copying it from d_coeffi + or accumulating the quadratic terms from blist +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_beta(const int& iatom, const int& idxb, const int& ielem) const +{ + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); + + // handle quadratic && chemflag as a special case + if (quadratic_flag && chem_flag) { + if (idxb == 0) { + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; + } + + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + real_type bveci = blist(iatom, idx_chem, idxb); + d_beta(iatom, icoeff) += d_coeffi[k] * bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + real_type bvecj = blist(iatom, jdx_chem, jdxb); + d_beta(iatom, icoeff) += d_coeffi[k] * bvecj; + d_beta(iatom, jcoeff) += d_coeffi[k] * bveci; + k++; + } + } + } + } else { + for (int itriple = 0; itriple < ntriples; itriple++) { + int icoeff = idxb + itriple * idxb_max; + Kokkos::atomic_add(&d_beta(iatom, icoeff), d_coeffi[icoeff+1]); + } + + if (quadratic_flag) { + int k = (idxb * (1 + 2 * idxb_max - idxb)) / 2 + idxb_max + 1; + real_type bveci = blist(iatom, 0, idxb); + Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bveci); + k++; + + for (int jdxb = idxb + 1; jdxb < idxb_max; jdxb++) { + real_type bvecj = blist(iatom, 0, jdxb); + 
Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bvecj); + Kokkos::atomic_add(&d_beta(iatom, jdxb), d_coeffi[k] * bveci); + k++; + } + } + } +} /* ---------------------------------------------------------------------- Compute Yi from Ui without storing Zi, looping over zlist indices. From 78bd71760b1cba34376432a0abf6216e92f85364 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 10:53:33 -0800 Subject: [PATCH 22/30] Added a host vs device abstraction for SNAP kernel launches, greatly improves CPU launch perf --- src/KOKKOS/pair_snap_kokkos.h | 115 +++++++++++----- src/KOKKOS/pair_snap_kokkos_impl.h | 213 +++++++++++++++++++---------- src/KOKKOS/sna_kokkos.h | 8 +- src/KOKKOS/sna_kokkos_impl.h | 51 +++++-- 4 files changed, 264 insertions(+), 123 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 0f1d1c8aee..9f6ef83016 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -137,12 +137,24 @@ class PairSNAPKokkos : public PairSNAP { template using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagPairSNAP>; + // Custom MDRangePolicy, Rank2, on the host, to reduce verbosity of kernel launches. The striding of this launch is intentionally + // different from the tiled 3D range policy on the device. 
+ template + using Snap2DHostRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::IndexType, Kokkos::Rank<2, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, TagPairSNAP>; + + // Custom RangePolicy, Rank2, on the host, to reduce verbosity of kernel launches + template + using Snap1DHostRangePolicy = typename Kokkos::RangePolicy, TagPairSNAP>; + // Helper routine that returns a CPU or a GPU policy as appropriate template auto snap_get_policy(const int& chunk_size_div, const int& second_loop) { - if constexpr (host_flag) - return typename Kokkos::RangePolicy(0, chunk_size_div * vector_length * second_loop); - else + if constexpr (host_flag) { + return Snap1DHostRangePolicy(0, chunk_size_div * vector_length); + + // the 2-d policy is still correct but it has atomics so it's slower on the CPU + //return Snap2DHostRangePolicy({0, 0}, {chunk_size_div * vector_length, second_loop}); + } else return Snap3DRangePolicy({0, 0, 0}, {vector_length, second_loop, chunk_size_div}, {vector_length, num_teams, 1}); @@ -176,11 +188,19 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + // GPU backend only KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; + // CPU and GPU KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUi,const int iatom_mod, const int j, const int iatom_div) const; + void operator() (TagPairSNAPPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPPreUi, const int& iatom, const int& j) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPPreUi, const int& iatom) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -189,25 +209,67 @@ class PairSNAPKokkos : public PairSNAP { void operator() 
(TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const; + void operator() (TagPairSNAPTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; + void operator() (TagPairSNAPTransformUi, const int& iatom, const int& idxu) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + void operator() (TagPairSNAPTransformUi, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroBeta,const int iatom_mod, const int idxb, const int iatom_div) const; - - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBeta, const int iatom_mod, const int idxb, const int iatom_div) const; + void operator() (TagPairSNAPComputeZi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const; + void operator() (TagPairSNAPComputeZi, const int& iatom, const int& idxz) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int idxz, const int iatom_div) const; + void operator() (TagPairSNAPComputeZi, const int& iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBi, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBi, const int& iatom, const int& idxb) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBi, const int& iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPZeroBeta, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + + 
KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPZeroBeta, const int& iatom, const int& idxb) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPZeroBeta, const int& iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBeta, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBeta, const int& iatom, const int& idxb) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBeta, const int& iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi, const int& iatom, const int& idxz) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi, const int& iatom) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom, const int& idxz) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom) const; template KOKKOS_INLINE_FUNCTION @@ -222,37 +284,22 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUi, const int& ii) const; + void operator() (TagPairSNAPComputeUiCPU, const int& iatom, const int& jnbor) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeUiCPU, const int& ii) const; + void operator() (TagPairSNAPComputeUiCPU, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPTransformUi, const int& ii) const; + void operator() (TagPairSNAPComputeDuidrjCPU, const int& iatom, const int& jnbor) const; KOKKOS_INLINE_FUNCTION - void operator() 
(TagPairSNAPComputeZi, const int& ii) const; + void operator() (TagPairSNAPComputeDuidrjCPU, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBi, const int& ii) const; + void operator() (TagPairSNAPComputeDeidrjCPU, const int& iatom, const int& jnbor) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroBeta, const int& ii) const; - - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBeta, const int& ii) const; - - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi, const int& ii) const; - - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYiWithZlist, const int& ii) const; - - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const; - - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const; + void operator() (TagPairSNAPComputeDeidrjCPU, const int& iatom) const; template KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index b758109a51..056dfbf231 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -223,7 +223,7 @@ void PairSNAPKokkos::compute(int eflag_in, // pre-compute ceil(chunk_size / vector_length) and the padded chunk size for convenience const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; - const int chunk_size_pad = chunk_size_div * vector_length; + //const int chunk_size_pad = chunk_size_div * vector_length; // ComputeNeigh if constexpr (host_flag) { @@ -251,15 +251,14 @@ void PairSNAPKokkos::compute(int eflag_in, // PreUi; same CPU and GPU codepath { auto policy_pre_ui = snap_get_policy(chunk_size_div, twojmax + 1); - //typename Kokkos::RangePolicy policy_preui_cpu(0, chunk_size * (twojmax + 1)); Kokkos::parallel_for("PreUi", policy_pre_ui, *this); } // ComputeUi; separate CPU, GPU codepaths if constexpr (host_flag) { // Fused calculation of ulist and accumulation into ulisttot 
using atomics - typename Kokkos::RangePolicy policy_ui_cpu(0, chunk_size*max_neighs); - Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); + auto policy_ui_cpu = snap_get_policy(chunk_size_div, max_neighs); + Kokkos::parallel_for("ComputeUiCPU", policy_ui_cpu, *this); } else { // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot @@ -296,9 +295,7 @@ void PairSNAPKokkos::compute(int eflag_in, // Expand ulisttot_re,_im -> ulisttot // Zero out ylist auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); - //int idxu_max = snaKK.idxu_max; - //typename Kokkos::RangePolicy policy_transform_ui_cpu(0, chunk_size * idxu_max); - Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + Kokkos::parallel_for("TransformUi", policy_transform_ui, *this); } //Compute bispectrum @@ -307,11 +304,11 @@ void PairSNAPKokkos::compute(int eflag_in, //ComputeZi auto policy_compute_zi = snap_get_policy(chunk_size_div, snaKK.idxz_max); - Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this); //ComputeBi auto policy_compute_bi = snap_get_policy(chunk_size_div, snaKK.idxb_max); - Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this); } { @@ -326,19 +323,19 @@ void PairSNAPKokkos::compute(int eflag_in, //Note zeroing `ylist` is fused into `TransformUi`. 
if (quadraticflag || eflag) { auto policy_compute_yi = snap_get_policy(chunk_size_div, snaKK.idxz_max); - Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this); + Kokkos::parallel_for("ComputeYiWithZlist", policy_compute_yi, *this); } else { auto policy_compute_yi = snap_get_policy(chunk_size_div, snaKK.idxz_max); - Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); + Kokkos::parallel_for("ComputeYi", policy_compute_yi, *this); } } if constexpr (host_flag) { //ComputeDuidrj and Deidrj - typename Kokkos::RangePolicy policy_duidrj_cpu(0, chunk_size*max_neighs); - Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); + auto policy_duidrj_cpu = snap_get_policy(chunk_size_div, max_neighs); + Kokkos::parallel_for("ComputeDuidrjCPU", policy_duidrj_cpu, *this); - typename Kokkos::RangePolicy policy_deidrj_cpu(0, chunk_size*max_neighs); + auto policy_deidrj_cpu = snap_get_policy(chunk_size_div, max_neighs); Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); } else { // GPU // Fused ComputeDuidrj, ComputeDeidrj @@ -738,7 +735,7 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const { +void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; @@ -750,15 +747,25 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int& ii) const { - const int iatom = ii / (twojmax+1); +void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int& iatom, const int& j) const { + if (iatom >= chunk_size) return; + + int itype = type(iatom); + int ielem = d_map[itype]; + + snaKK.pre_ui(iatom, j, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() 
(TagPairSNAPPreUi, const int& iatom) const { if (iatom >= chunk_size) return; - const int j = ii % (twojmax+1); const int itype = type(iatom); const int ielem = d_map[itype]; - snaKK.pre_ui(iatom, j, ielem); + for (int j = 0; j <= twojmax; j++) + snaKK.pre_ui(iatom, j, ielem); } /* ---------------------------------------------------------------------- @@ -823,15 +830,20 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU, const int& ii) const { - const int iatom = ii / max_neighs; - const int jnbor = ii % max_neighs; - +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU, const int& iatom, const int& jnbor) const { if (iatom >= chunk_size) return; const int ninside = d_ninside(iatom); if (jnbor >= ninside) return; + snaKK.template compute_ui_cpu(iatom, jnbor); +} - snaKK.compute_ui_cpu(iatom, jnbor); +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU, const int& iatom) const { + if (iatom >= chunk_size) return; + const int ninside = d_ninside(iatom); + for (int jnbor = 0; jnbor < ninside; jnbor++) + snaKK.template compute_ui_cpu(iatom, jnbor); } /* ---------------------------------------------------------------------- @@ -841,22 +853,28 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const { +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (idxu > snaKK.idxu_max) return; + if (idxu >= snaKK.idxu_max) return; snaKK.transform_ui(iatom, idxu); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int& ii) const { - const int iatom = ii / snaKK.idxu_max; - const int idxu = ii % 
snaKK.idxu_max; +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int& iatom, const int& idxu) const { if (iatom >= chunk_size) return; snaKK.transform_ui(iatom, idxu); } +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int idxu = 0; idxu < snaKK.idxu_max; idxu++) + snaKK.transform_ui(iatom, idxu); +} + /* ---------------------------------------------------------------------- Compute all elements of the Z tensor and store them into the `zlist` View. This is only used for energy timesteps or quadratic SNAP. @@ -865,7 +883,7 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int iatom_mod, const int jjz, const int iatom_div) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjz >= snaKK.idxz_max) return; @@ -874,13 +892,19 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& ii) const { - const int iatom = ii / snaKK.idxz_max; - const int jjz = ii % snaKK.idxz_max; +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom, const int& jjz) const { if (iatom >= chunk_size) return; snaKK.compute_zi(iatom, jjz); } +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.compute_zi(iatom, jjz); +} + /* ---------------------------------------------------------------------- Compute the energy triple products and store in the "blist" View. CPU and GPU. 
@@ -888,51 +912,59 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { - +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom_mod, const int& jjb, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjb >= snaKK.idxb_max) return; - snaKK.compute_bi(iatom, jjb); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& ii) const { - const int iatom = ii / snaKK.idxb_max; - const int jjb = ii % snaKK.idxb_max; +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom, const int& jjb) const { if (iatom >= chunk_size) return; snaKK.compute_bi(iatom, jjb); } +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) + snaKK.compute_bi(iatom, jjb); +} + /* ---------------------------------------------------------------------- Zero out beta in advance of accumulating. CPU and GPU. 
------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta,const int iatom_mod, const int jjb, const int iatom_div) const { - +void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& iatom_mod, const int& jjb, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjb >= snaKK.idxb_max) return; - for (int itriple = 0; itriple < snaKK.ntriples; itriple++) snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& ii) const { - const int iatom = ii / snaKK.idxb_max; - const int jjb = ii % snaKK.idxb_max; +void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& iatom, const int& jjb) const { if (iatom >= chunk_size) return; - for (int itriple = 0; itriple < snaKK.ntriples; itriple++) snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; } +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) + for (int itriple = 0; itriple < snaKK.ntriples; itriple++) + snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; +} + /* ---------------------------------------------------------------------- Assemble the "beta" coefficients that enter the computation of the adjoint matrices Y. 
For quadratic SNAP, this includes accumulating @@ -943,7 +975,7 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int iatom_mod, const int idxb, const int iatom_div) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& iatom_mod, const int& idxb, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (idxb >= snaKK.idxb_max) return; @@ -952,21 +984,32 @@ void PairSNAPKokkos::operator() (TagPairSN const int itype = type[i]; const int ielem = d_map[itype]; - snaKK.compute_beta(iatom, idxb, ielem); + snaKK.template compute_beta(iatom, idxb, ielem); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& ii) const { - const int iatom = ii / snaKK.idxb_max; - const int idxb = ii % snaKK.idxb_max; +void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& iatom, const int& idxb) const { if (iatom >= chunk_size) return; const int i = d_ilist[iatom + chunk_offset]; const int itype = type[i]; const int ielem = d_map[itype]; - snaKK.compute_beta(iatom, idxb, ielem); + snaKK.template compute_beta(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + for (int idxb = 0; idxb < snaKK.idxb_max; idxb++) + snaKK.template compute_beta(iatom, idxb, ielem); } /* ---------------------------------------------------------------------- @@ -976,20 +1019,26 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int iatom_mod, const int jjz, const int iatom_div) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& 
iatom_mod, const int& jjz, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjz >= snaKK.idxz_max) return; - snaKK.compute_yi(iatom, jjz); + snaKK.template compute_yi(iatom, jjz); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& ii) const { - const int iatom = ii / snaKK.idxz_max; - const int jjz = ii % snaKK.idxz_max; +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom, const int& jjz) const { if (iatom >= chunk_size) return; - snaKK.compute_yi(iatom, jjz); + snaKK.template compute_yi(iatom, jjz); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.template compute_yi(iatom, jjz); } /* ---------------------------------------------------------------------- @@ -999,22 +1048,26 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const { - +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom_mod, const int& jjz, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjz >= snaKK.idxz_max) return; - - snaKK.compute_yi_with_zlist(iatom, jjz); + snaKK.template compute_yi_with_zlist(iatom, jjz); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& ii) const { - const int iatom = ii / snaKK.idxz_max; - const int jjz = ii % snaKK.idxz_max; +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom, const int& jjz) const { if (iatom >= chunk_size) return; - snaKK.compute_yi_with_zlist(iatom, jjz); + snaKK.template compute_yi_with_zlist(iatom, jjz); +} + 
+template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.template compute_yi_with_zlist(iatom, jjz); } /* ---------------------------------------------------------------------- @@ -1084,16 +1137,22 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const { - const int iatom = ii / max_neighs; - const int jnbor = ii % max_neighs; +void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU, const int& iatom, const int& jnbor) const { if (iatom >= chunk_size) return; const int ninside = d_ninside(iatom); if (jnbor >= ninside) return; - snaKK.compute_duidrj_cpu(iatom, jnbor); } +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU, const int& iatom) const { + if (iatom >= chunk_size) return; + const int ninside = d_ninside(iatom); + for (int jnbor = 0; jnbor < ninside; jnbor++) + snaKK.compute_duidrj_cpu(iatom, jnbor); +} + /* ---------------------------------------------------------------------- Assemble the force contributions for each atom, neighbor pair by contracting the adjoint matrices Y with the pre-computed derivatives @@ -1102,16 +1161,22 @@ void PairSNAPKokkos::operator() (TagPairSN template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const { - const int iatom = ii / max_neighs; - const int jnbor = ii % max_neighs; +void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU, const int& iatom, const int& jnbor) const { if (iatom >= chunk_size) return; const int ninside = d_ninside(iatom); if (jnbor >= ninside) return; - snaKK.compute_deidrj_cpu(iatom, jnbor); } +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU, const int& iatom) const { + if (iatom >= 
chunk_size) return; + const int ninside = d_ninside(iatom); + for (int jnbor = 0; jnbor < ninside; jnbor++) + snaKK.compute_deidrj_cpu(iatom, jnbor); +} + /* ---------------------------------------------------------------------- This routine formally accumulates the "chunked" force contributions into the broader LAMMPS "f" force View. As appropriate it diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index cd624238a1..438f1b4304 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -206,13 +206,13 @@ class SNAKokkos { KOKKOS_INLINE_FUNCTION void compute_zi(const int&, const int&) const; // ForceSNAP - KOKKOS_INLINE_FUNCTION + template KOKKOS_INLINE_FUNCTION void compute_yi(const int&, const int&) const; // ForceSNAP - KOKKOS_INLINE_FUNCTION + template KOKKOS_INLINE_FUNCTION void compute_yi_with_zlist(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi(const int&, const int&) const; // ForceSNAP - KOKKOS_INLINE_FUNCTION + template KOKKOS_INLINE_FUNCTION void compute_beta(const int&, const int&, const int&) const; // functions for derivatives, GPU only @@ -244,7 +244,7 @@ class SNAKokkos { const int&, const int&, const int&) const; // functions for bispectrum coefficients, CPU only - KOKKOS_INLINE_FUNCTION + template KOKKOS_INLINE_FUNCTION void compute_ui_cpu(const int&, const int&) const; // ForceSNAP // functions for derivatives, CPU only diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 983f7a0d27..7d86d5ceaa 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -638,7 +638,7 @@ void SNAKokkos::evaluate_ui_jbend(const Wi ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION +template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_ui_cpu(const int& iatom, const int& jnbor) const { // utot(j,ma,mb) = 0 for all j,ma,ma @@ -743,8 +743,13 @@ void SNAKokkos::compute_ui_cpu(const int& int count = 0; 
for (int mb = 0; 2*mb <= j; mb++) { for (int ma = 0; ma <= j; ma++) { - Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).re); - Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).im); + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).re); + Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).im); + } else { + ulisttot_re(iatom, jelem, jju_half+count) += sfac * ulist_cpu(iatom, jnbor, jju_cache+count).re; + ulisttot_im(iatom, jelem, jju_half+count) += sfac * ulist_cpu(iatom, jnbor, jju_cache+count).im; + } count++; } } @@ -959,6 +964,7 @@ void SNAKokkos::compute_bi(const int& iato ------------------------------------------------------------------------- */ template +template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_beta(const int& iatom, const int& idxb, const int& ielem) const { @@ -991,19 +997,30 @@ void SNAKokkos::compute_beta(const int& ia } else { for (int itriple = 0; itriple < ntriples; itriple++) { int icoeff = idxb + itriple * idxb_max; - Kokkos::atomic_add(&d_beta(iatom, icoeff), d_coeffi[icoeff+1]); + if constexpr (need_atomics) + Kokkos::atomic_add(&d_beta(iatom, icoeff), d_coeffi[icoeff+1]); + else + d_beta(iatom, icoeff) += d_coeffi[icoeff+1]; } if (quadratic_flag) { int k = (idxb * (1 + 2 * idxb_max - idxb)) / 2 + idxb_max + 1; real_type bveci = blist(iatom, 0, idxb); - Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bveci); + if constexpr (need_atomics) + Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bveci); + else + d_beta(iatom, idxb) += d_coeffi[k] * bveci; k++; for (int jdxb = idxb + 1; jdxb < idxb_max; jdxb++) { real_type bvecj = blist(iatom, 0, jdxb); - Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bvecj); - 
Kokkos::atomic_add(&d_beta(iatom, jdxb), d_coeffi[k] * bveci); + if constexpr (need_atomics) { + Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bvecj); + Kokkos::atomic_add(&d_beta(iatom, jdxb), d_coeffi[k] * bveci); + } else { + d_beta(iatom, idxb) += d_coeffi[k] * bvecj; + d_beta(iatom, jdxb) += d_coeffi[k] * bveci; + } k++; } } @@ -1015,6 +1032,7 @@ void SNAKokkos::compute_beta(const int& ia ------------------------------------------------------------------------- */ template +template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi(const int& iatom, const int& jjz) const { @@ -1040,8 +1058,13 @@ void SNAKokkos::compute_yi(const int& iato const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); - Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, elem3, jju_half) += betaj * ztmp.re; + ylist_im(iatom, elem3, jju_half) += betaj * ztmp.im; + } } // end loop over elem3 } // end loop over elem2 } // end loop over elem1 @@ -1052,6 +1075,7 @@ void SNAKokkos::compute_yi(const int& iato ------------------------------------------------------------------------- */ template +template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi_with_zlist(const int& iatom, const int& jjz) const { @@ -1071,8 +1095,13 @@ void SNAKokkos::compute_yi_with_zlist(cons const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); - Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, 
elem3, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, elem3, jju_half) += betaj * ztmp.re; + ylist_im(iatom, elem3, jju_half) += betaj * ztmp.im; + } } // end loop over elem3 idouble++; } // end loop over elem2 From dde2c91a27af85b2fc5b25df4ac767ffd93c4a34 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 11:03:37 -0800 Subject: [PATCH 23/30] Refactored the beta calculation into separate linear and quadratic portions --- src/KOKKOS/pair_snap_kokkos.h | 19 ++--- src/KOKKOS/pair_snap_kokkos_impl.h | 115 ++++++++++++++++------------- src/KOKKOS/sna_kokkos.h | 4 +- src/KOKKOS/sna_kokkos_impl.h | 67 +++++++++-------- 4 files changed, 113 insertions(+), 92 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 9f6ef83016..97ff5a00f0 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -46,8 +46,8 @@ struct TagPairSNAPPreUi{}; struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPZeroBeta{}; -struct TagPairSNAPComputeBeta{}; +struct TagPairSNAPComputeBetaLinear{}; +struct TagPairSNAPComputeBetaQuadratic{}; struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYiWithZlist{}; template @@ -94,7 +94,6 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 2; static constexpr int tile_size_compute_zi = 2; static constexpr int tile_size_compute_bi = 2; - static constexpr int tile_size_zero_beta = 2; static constexpr int tile_size_compute_beta = 2; static constexpr int tile_size_compute_yi = 2; static constexpr int team_size_compute_fused_deidrj = 2; @@ -106,7 +105,6 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 8; static constexpr int tile_size_compute_zi = 4; static constexpr int tile_size_compute_bi = 4; - static constexpr int tile_size_zero_beta = 8; static constexpr int 
tile_size_compute_beta = 8; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = 4; @@ -118,7 +116,6 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 4; static constexpr int tile_size_compute_zi = 8; static constexpr int tile_size_compute_bi = 4; - static constexpr int tile_size_zero_beta = 4; static constexpr int tile_size_compute_beta = 4; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2; @@ -236,22 +233,22 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeBi, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroBeta, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + void operator() (TagPairSNAPComputeBetaLinear, const int& iatom_mod, const int& idxb, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroBeta, const int& iatom, const int& idxb) const; + void operator() (TagPairSNAPComputeBetaLinear, const int& iatom, const int& idxb) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroBeta, const int& iatom) const; + void operator() (TagPairSNAPComputeBetaLinear, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBeta, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom_mod, const int& idxb, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBeta, const int& iatom, const int& idxb) const; + void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom, const int& idxb) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBeta, const int& iatom) const; + void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYi, const int& 
iatom_mod, const int& idxz, const int& iatom_div) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 056dfbf231..3ba61dab91 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -292,7 +292,7 @@ void PairSNAPKokkos::compute(int eflag_in, } { - // Expand ulisttot_re,_im -> ulisttot + // Expand ulisttot -> ulisttot // Zero out ylist auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); Kokkos::parallel_for("TransformUi", policy_transform_ui, *this); @@ -312,13 +312,15 @@ void PairSNAPKokkos::compute(int eflag_in, } { - // Zero beta out - auto policy_zero_beta = snap_get_policy(chunk_size_div, snaKK.idxb_max); - Kokkos::parallel_for("ZeroBeta",policy_zero_beta,*this); + //Compute beta = dE_i/dB_i for all i in list; linear portion only + auto policy_compute_beta_linear = snap_get_policy(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBetaLinear", policy_compute_beta_linear, *this); - //Compute beta = dE_i/dB_i for all i in list - auto policy_compute_beta = snap_get_policy(chunk_size_div, snaKK.idxb_max); - Kokkos::parallel_for("ComputeBeta", policy_compute_beta, *this); + if (quadraticflag) { + // Compute the quadratic correction + auto policy_compute_beta_quadratic = snap_get_policy(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBetaQuadratic", policy_compute_beta_quadratic, *this); + } //Note zeroing `ylist` is fused into `TransformUi`. if (quadraticflag || eflag) { @@ -450,8 +452,8 @@ void PairSNAPKokkos::compute(int eflag_in, // free duplicated memory if (need_dup) { - dup_f = {}; - dup_vatom = {}; + dup_f = decltype(dup_f)(); + dup_vatom = decltype(dup_vatom)(); } } @@ -934,48 +936,15 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.compute_bi(iatom, jjb); } -/* ---------------------------------------------------------------------- - Zero out beta in advance of accumulating. CPU and GPU. 
-------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& iatom_mod, const int& jjb, const int& iatom_div) const { - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - if (jjb >= snaKK.idxb_max) return; - for (int itriple = 0; itriple < snaKK.ntriples; itriple++) - snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& iatom, const int& jjb) const { - if (iatom >= chunk_size) return; - for (int itriple = 0; itriple < snaKK.ntriples; itriple++) - snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& iatom) const { - if (iatom >= chunk_size) return; - for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) - for (int itriple = 0; itriple < snaKK.ntriples; itriple++) - snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; -} - /* ---------------------------------------------------------------------- Assemble the "beta" coefficients that enter the computation of the - adjoint matrices Y. For quadratic SNAP, this includes accumulating - energy triple products into an "effective" beta that encodes the - quadratic terms with otherwise linear compute work. - CPU and GPU. + adjoint matrices Y. This is just for a linear potential. A quadratic + contribution is added in a subsequent kernel. CPU and GPU. 
------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& iatom_mod, const int& idxb, const int& iatom_div) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaLinear, const int& iatom_mod, const int& idxb, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (idxb >= snaKK.idxb_max) return; @@ -984,24 +953,24 @@ void PairSNAPKokkos::operator() (TagPairSN const int itype = type[i]; const int ielem = d_map[itype]; - snaKK.template compute_beta(iatom, idxb, ielem); + snaKK.compute_beta_linear(iatom, idxb, ielem); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& iatom, const int& idxb) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaLinear, const int& iatom, const int& idxb) const { if (iatom >= chunk_size) return; const int i = d_ilist[iatom + chunk_offset]; const int itype = type[i]; const int ielem = d_map[itype]; - snaKK.template compute_beta(iatom, idxb, ielem); + snaKK.compute_beta_linear(iatom, idxb, ielem); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& iatom) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaLinear, const int& iatom) const { if (iatom >= chunk_size) return; const int i = d_ilist[iatom + chunk_offset]; @@ -1009,7 +978,53 @@ void PairSNAPKokkos::operator() (TagPairSN const int ielem = d_map[itype]; for (int idxb = 0; idxb < snaKK.idxb_max; idxb++) - snaKK.template compute_beta(iatom, idxb, ielem); + snaKK.compute_beta_linear(iatom, idxb, ielem); +} + +/* ---------------------------------------------------------------------- + Accumulate the quadratic terms which includes accumulating + energy triple products into an "effective" beta that encodes the + quadratic terms with otherwise linear compute work. 
+ CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom_mod, const int& idxb, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (idxb >= snaKK.idxb_max) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + snaKK.template compute_beta_quadratic(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom, const int& idxb) const { + if (iatom >= chunk_size) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + snaKK.template compute_beta_quadratic(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + for (int idxb = 0; idxb < snaKK.idxb_max; idxb++) + snaKK.template compute_beta_quadratic(iatom, idxb, ielem); } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 438f1b4304..4fc34c260d 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -212,8 +212,10 @@ class SNAKokkos { void compute_yi_with_zlist(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi(const int&, const int&) const; // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_beta_linear(const int&, const int&, const int&) const; template KOKKOS_INLINE_FUNCTION - void compute_beta(const int&, const int&, const int&) const; + void compute_beta_quadratic(const int&, const int&, 
const int&) const; // functions for derivatives, GPU only // version of the code with parallelism over j_bend diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 7d86d5ceaa..e14add1c8f 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -963,20 +963,29 @@ void SNAKokkos::compute_bi(const int& iato or accumulating the quadratic terms from blist ------------------------------------------------------------------------- */ +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_beta_linear(const int& iatom, const int& idxb, const int& ielem) const +{ + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); + + for (int itriple = 0; itriple < ntriples; itriple++) { + int icoeff = idxb + itriple * idxb_max; + d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; + } +} + template template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_beta(const int& iatom, const int& idxb, const int& ielem) const +void SNAKokkos::compute_beta_quadratic(const int& iatom, const int& idxb, const int& ielem) const { auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); // handle quadratic && chemflag as a special case - if (quadratic_flag && chem_flag) { + if (chem_flag) { if (idxb == 0) { - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; - } - + // no need to use atomics, we're just serializing int k = ncoeff+1; for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; @@ -995,38 +1004,36 @@ void SNAKokkos::compute_beta(const int& ia } } } else { - for (int itriple = 0; itriple < ntriples; itriple++) { - int icoeff = idxb + itriple * idxb_max; - if constexpr (need_atomics) - Kokkos::atomic_add(&d_beta(iatom, icoeff), d_coeffi[icoeff+1]); - else - d_beta(iatom, icoeff) += d_coeffi[icoeff+1]; - } + // Compute triangular partial sum via a closed form to get the starting offset + int k = (idxb * (1 + 2 * idxb_max - idxb)) / 2 + idxb_max + 1; + 
real_type bveci = blist(iatom, 0, idxb); - if (quadratic_flag) { - int k = (idxb * (1 + 2 * idxb_max - idxb)) / 2 + idxb_max + 1; - real_type bveci = blist(iatom, 0, idxb); + // Locally accumulate the contribution to d_beta(iatom, idxb) + real_type beta_idxb_accum = d_coeffi[k] * bveci; + k++; + + for (int jdxb = idxb + 1; jdxb < idxb_max; jdxb++) { + real_type bvecj = blist(iatom, 0, jdxb); + real_type coeff_k = d_coeffi[k]; + beta_idxb_accum += coeff_k * bvecj; + + // Accumulate "half" contribution into d_beta(iatom, jdxb) if constexpr (need_atomics) - Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bveci); + Kokkos::atomic_add(&d_beta(iatom, jdxb), coeff_k * bveci); else - d_beta(iatom, idxb) += d_coeffi[k] * bveci; + d_beta(iatom, jdxb) += coeff_k * bveci; + k++; - - for (int jdxb = idxb + 1; jdxb < idxb_max; jdxb++) { - real_type bvecj = blist(iatom, 0, jdxb); - if constexpr (need_atomics) { - Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bvecj); - Kokkos::atomic_add(&d_beta(iatom, jdxb), d_coeffi[k] * bveci); - } else { - d_beta(iatom, idxb) += d_coeffi[k] * bvecj; - d_beta(iatom, jdxb) += d_coeffi[k] * bveci; - } - k++; - } } + + if constexpr (need_atomics) + Kokkos::atomic_add(&d_beta(iatom, idxb), beta_idxb_accum); + else + d_beta(iatom, idxb) += beta_idxb_accum; } } + /* ---------------------------------------------------------------------- Compute Yi from Ui without storing Zi, looping over zlist indices. 
------------------------------------------------------------------------- */ From f3221ee2e1c91779ec5ea474dc6cca94e1e24832 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 20 Nov 2024 11:10:13 -0800 Subject: [PATCH 24/30] Abstracted computation portion of compute_bi into a separate evaluate_bi --- src/KOKKOS/pair_snap_kokkos_impl.h | 6 +- src/KOKKOS/sna_kokkos.h | 4 + src/KOKKOS/sna_kokkos_impl.h | 129 +++++++++++++++++------------ 3 files changed, 82 insertions(+), 57 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 3ba61dab91..0e3ec77616 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -292,7 +292,7 @@ void PairSNAPKokkos::compute(int eflag_in, } { - // Expand ulisttot -> ulisttot + // Expand ulisttot_re,_im -> ulisttot // Zero out ylist auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); Kokkos::parallel_for("TransformUi", policy_transform_ui, *this); @@ -452,8 +452,8 @@ void PairSNAPKokkos::compute(int eflag_in, // free duplicated memory if (need_dup) { - dup_f = decltype(dup_f)(); - dup_vatom = decltype(dup_vatom)(); + dup_f = {}; + dup_vatom = {}; } } diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 4fc34c260d..922aa22351 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -236,6 +236,10 @@ class SNAKokkos { KOKKOS_FORCEINLINE_FUNCTION complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const real_type*) const; + // plugged into compute_bi + KOKKOS_FORCEINLINE_FUNCTION + real_type evaluate_bi(const int&, const int&, const int&, const int&, + const int&, const int&, const int&) const; // plugged into compute_yi, compute_yi_with_zlist KOKKOS_FORCEINLINE_FUNCTION real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&) const; diff 
--git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index e14add1c8f..f0d4881f8d 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -897,60 +897,7 @@ void SNAKokkos::compute_bi(const int& iato for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { for (int elem3 = 0; elem3 < nelements; elem3++) { - - double sumzu = 0.0; - double sumzu_temp = 0.0; - - for (int mb = 0; 2*mb < j; mb++) { - for (int ma = 0; ma <= j; ma++) { - const int jju_index = jju+mb*(j+1)+ma; - const int jjz_index = jjz+mb*(j+1)+ma; - if (2*mb == j) return; // I think we can remove this? - const complex utot = ulisttot(iatom, elem3, jju_index); - const complex zloc = zlist(iatom, idouble, jjz_index); - sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; - } - } - sumzu += sumzu_temp; - - // For j even, special treatment for middle column - if (j%2 == 0) { - sumzu_temp = 0.; - - const int mb = j/2; - for (int ma = 0; ma < mb; ma++) { - const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - - const complex utot = ulisttot(iatom, elem3, jju_index); - const complex zloc = zlist(iatom, idouble, jjz_index); - sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; - - } - sumzu += sumzu_temp; - - const int ma = mb; - const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - - const complex utot = ulisttot(iatom, elem3, jju_index); - const complex zloc = zlist(iatom, idouble, jjz_index); - sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); - } // end if jeven - - sumzu *= static_cast(2.0); - if (bzero_flag) { - if (!wselfall_flag) { - if (elem1 == elem2 && elem1 == elem3) { - sumzu -= bzero[j]; - } - } else { - sumzu -= bzero[j]; - } - } - blist(iatom, itriple, jjb) = sumzu; - //} // end loop over j - //} // end loop over j1, j2 + blist(iatom, itriple, jjb) = evaluate_bi(j, jjz, jju, iatom, elem1, elem2, 
elem3); itriple++; } // end loop over elem3 idouble++; @@ -958,6 +905,80 @@ void SNAKokkos::compute_bi(const int& iato } // end loop over elem1 } +/* ---------------------------------------------------------------------- + Core "evaluation" kernel that computes a single blist value. + This gets used in `compute_bi` +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +real_type SNAKokkos::evaluate_bi(const int& j, const int& jjz, const int& jju, const int& iatom, const int& elem1, const int& elem2, const int& elem3) const +{ + // this computes the: + // b(j1,j2,j) = 0 + // for mb = 0,...,jmid + // for ma = 0,...,j + // b(j1,j2,j) += + // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) + // portion + + const int idouble = elem1 * nelements + elem2; + real_type sumzu = 0.0; + real_type sumzu_temp = 0.0; + + for (int mb = 0; 2*mb < j; mb++) { + for (int ma = 0; ma <= j; ma++) { + const int jju_index = jju+mb*(j+1)+ma; + const int jjz_index = jjz+mb*(j+1)+ma; + if (2*mb == j) return 0; // I think we can remove this? 
+ const complex utot = ulisttot(iatom, elem3, jju_index); + const complex zloc = zlist(iatom, idouble, jjz_index); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + } + } + sumzu += sumzu_temp; + + // For j even, special treatment for middle column + if (j%2 == 0) { + sumzu_temp = 0.; + + const int mb = j/2; + for (int ma = 0; ma < mb; ma++) { + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const complex utot = ulisttot(iatom, elem3, jju_index); + const complex zloc = zlist(iatom, idouble, jjz_index); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + + } + sumzu += sumzu_temp; + + const int ma = mb; + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const complex utot = ulisttot(iatom, elem3, jju_index); + const complex zloc = zlist(iatom, idouble, jjz_index); + sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); + } // end if jeven + + sumzu *= static_cast(2.0); + if (bzero_flag) { + if (!wselfall_flag) { + if (elem1 == elem2 && elem1 == elem3) { + sumzu -= bzero[j]; + } + } else { + sumzu -= bzero[j]; + } + } + return sumzu; + //} // end loop over j + //} // end loop over j1, j2 +} + + /* ---------------------------------------------------------------------- compute beta by either appropriately copying it from d_coeffi or accumulating the quadratic terms from blist From 67470f236ebf6e90686767aba7d3836ebcc9d8c7 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 21 Nov 2024 13:48:04 -0800 Subject: [PATCH 25/30] Added a (default zero) extra launch bound arg to help with ComputeYi perf consistency --- src/KOKKOS/pair_snap_kokkos.h | 18 +++++++++++++----- src/KOKKOS/pair_snap_kokkos_impl.h | 4 ++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 97ff5a00f0..13e838356b 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ 
b/src/KOKKOS/pair_snap_kokkos.h @@ -93,9 +93,11 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int team_size_compute_ui = 2; static constexpr int tile_size_transform_ui = 2; static constexpr int tile_size_compute_zi = 2; + static constexpr int min_blocks_compute_zi = 0; // no minimum bound static constexpr int tile_size_compute_bi = 2; static constexpr int tile_size_compute_beta = 2; static constexpr int tile_size_compute_yi = 2; + static constexpr int min_blocks_compute_yi = 0; // no minimum bound static constexpr int team_size_compute_fused_deidrj = 2; #elif defined(KOKKOS_ENABLE_SYCL) static constexpr int team_size_compute_neigh = 4; @@ -104,9 +106,11 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int team_size_compute_ui = 8; static constexpr int tile_size_transform_ui = 8; static constexpr int tile_size_compute_zi = 4; + static constexpr int min_blocks_compute_zi = 0; // no minimum bound static constexpr int tile_size_compute_bi = 4; static constexpr int tile_size_compute_beta = 8; static constexpr int tile_size_compute_yi = 8; + static constexpr int min_blocks_compute_yi = 0; // no minimum bound static constexpr int team_size_compute_fused_deidrj = 4; #else static constexpr int team_size_compute_neigh = 4; @@ -119,14 +123,18 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_compute_beta = 4; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 
4 : 2; + + // this empirically reduces perf fluctuations from compiler version to compiler version + static constexpr int min_blocks_compute_zi = 4; + static constexpr int min_blocks_compute_yi = 4; #endif // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches // This hides the Kokkos::IndexType and Kokkos::Rank<3...> // and reduces the verbosity of the LaunchBound by hiding the explicit // multiplication by vector_length - template - using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagPairSNAP>; + template + using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagPairSNAP>; // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches // This hides the LaunchBounds abstraction by hiding the explicit @@ -144,7 +152,7 @@ class PairSNAPKokkos : public PairSNAP { using Snap1DHostRangePolicy = typename Kokkos::RangePolicy, TagPairSNAP>; // Helper routine that returns a CPU or a GPU policy as appropriate - template + template auto snap_get_policy(const int& chunk_size_div, const int& second_loop) { if constexpr (host_flag) { return Snap1DHostRangePolicy(0, chunk_size_div * vector_length); @@ -152,9 +160,9 @@ class PairSNAPKokkos : public PairSNAP { // the 2-d policy is still correct but it has atomics so it's slower on the CPU //return Snap2DHostRangePolicy({0, 0}, {chunk_size_div * vector_length, second_loop}); } else - return Snap3DRangePolicy({0, 0, 0}, + return Snap3DRangePolicy({0, 0, 0}, {vector_length, second_loop, chunk_size_div}, - {vector_length, num_teams, 1}); + {vector_length, num_tiles, 1}); } PairSNAPKokkos(class LAMMPS *); diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 0e3ec77616..c2e546912e 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -303,7 +303,7 @@ void 
PairSNAPKokkos::compute(int eflag_in, // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` //ComputeZi - auto policy_compute_zi = snap_get_policy(chunk_size_div, snaKK.idxz_max); + auto policy_compute_zi = snap_get_policy(chunk_size_div, snaKK.idxz_max); Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this); //ComputeBi @@ -327,7 +327,7 @@ void PairSNAPKokkos::compute(int eflag_in, auto policy_compute_yi = snap_get_policy(chunk_size_div, snaKK.idxz_max); Kokkos::parallel_for("ComputeYiWithZlist", policy_compute_yi, *this); } else { - auto policy_compute_yi = snap_get_policy(chunk_size_div, snaKK.idxz_max); + auto policy_compute_yi = snap_get_policy(chunk_size_div, snaKK.idxz_max); Kokkos::parallel_for("ComputeYi", policy_compute_yi, *this); } } From 28e64fca94cb8abe3f0b2c483dccf5652df8235b Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 21 Nov 2024 14:06:10 -0800 Subject: [PATCH 26/30] Added templating over chemsnap for ComputeZi and ComputeBi --- src/KOKKOS/pair_snap_kokkos.h | 28 +++++++-------- src/KOKKOS/pair_snap_kokkos_impl.h | 56 +++++++++++++++++------------- src/KOKKOS/sna_kokkos.h | 4 +-- src/KOKKOS/sna_kokkos_impl.h | 49 +++++++++++++++----------- 4 files changed, 76 insertions(+), 61 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 13e838356b..c42d7cdb88 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -44,8 +44,8 @@ namespace LAMMPS_NS { // Routines for both the CPU and GPU backend struct TagPairSNAPPreUi{}; struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist -struct TagPairSNAPComputeZi{}; -struct TagPairSNAPComputeBi{}; +template struct TagPairSNAPComputeZi{}; +template struct TagPairSNAPComputeBi{}; struct TagPairSNAPComputeBetaLinear{}; struct TagPairSNAPComputeBetaQuadratic{}; struct TagPairSNAPComputeYi{}; @@ -222,23 +222,23 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION 
void operator() (TagPairSNAPTransformUi, const int& iatom) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeZi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi, const int& iatom, const int& idxz) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeZi, const int& iatom, const int& idxz) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi, const int& iatom) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeZi, const int& iatom) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBi, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBi, const int& iatom_mod, const int& idxb, const int& iatom_div) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBi, const int& iatom, const int& idxb) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBi, const int& iatom, const int& idxb) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBi, const int& iatom) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBi, const int& iatom) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBetaLinear, const int& iatom_mod, const int& idxb, const int& iatom_div) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index c2e546912e..dfdee2e1c0 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -302,13 +302,21 @@ void PairSNAPKokkos::compute(int eflag_in, if (quadraticflag || eflag) { // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` - //ComputeZi - auto 
policy_compute_zi = snap_get_policy(chunk_size_div, snaKK.idxz_max); - Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this); + //ComputeZi and Bi + if (nelements > 1) { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this); + + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this); + } else { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this); + + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this); + } - //ComputeBi - auto policy_compute_bi = snap_get_policy(chunk_size_div, snaKK.idxb_max); - Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this); } { @@ -884,27 +892,27 @@ void PairSNAPKokkos::operator() (TagPairSN ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjz >= snaKK.idxz_max) return; - snaKK.compute_zi(iatom, jjz); + snaKK.template compute_zi(iatom, jjz); } template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom, const int& jjz) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom, const int& jjz) const { if (iatom >= chunk_size) return; - snaKK.compute_zi(iatom, jjz); + snaKK.template compute_zi(iatom, jjz); } 
template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom) const { if (iatom >= chunk_size) return; for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) - snaKK.compute_zi(iatom, jjz); + snaKK.template compute_zi(iatom, jjz); } /* ---------------------------------------------------------------------- @@ -913,27 +921,27 @@ void PairSNAPKokkos::operator() (TagPairSN ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom_mod, const int& jjb, const int& iatom_div) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom_mod, const int& jjb, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjb >= snaKK.idxb_max) return; - snaKK.compute_bi(iatom, jjb); + snaKK.template compute_bi(iatom, jjb); } template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom, const int& jjb) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom, const int& jjb) const { if (iatom >= chunk_size) return; - snaKK.compute_bi(iatom, jjb); + snaKK.template compute_bi(iatom, jjb); } template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom) const { if (iatom >= chunk_size) return; for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) - snaKK.compute_bi(iatom, jjb); + snaKK.template compute_bi(iatom, jjb); } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/sna_kokkos.h 
b/src/KOKKOS/sna_kokkos.h index 922aa22351..ee1eb263dc 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -204,13 +204,13 @@ class SNAKokkos { KOKKOS_INLINE_FUNCTION void transform_ui(const int&, const int&) const; - KOKKOS_INLINE_FUNCTION + template KOKKOS_INLINE_FUNCTION void compute_zi(const int&, const int&) const; // ForceSNAP template KOKKOS_INLINE_FUNCTION void compute_yi(const int&, const int&) const; // ForceSNAP template KOKKOS_INLINE_FUNCTION void compute_yi_with_zlist(const int&, const int&) const; // ForceSNAP - KOKKOS_INLINE_FUNCTION + template KOKKOS_INLINE_FUNCTION void compute_bi(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_beta_linear(const int&, const int&, const int&) const; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index f0d4881f8d..2d567759ea 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -794,7 +794,7 @@ void SNAKokkos::transform_ui(const int& ia ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION +template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_zi(const int& iatom, const int& jjz) const { int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg; @@ -802,14 +802,17 @@ void SNAKokkos::compute_zi(const int& iato const real_type *cgblock = cglist.data() + idxcg; - int idouble = 0; - - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); - idouble++; - } // end loop over elem2 - } // end loop over elem1 + if constexpr (chemsnap) { + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); + idouble++; + } 
// end loop over elem2 + } // end loop over elem1 + } else { + zlist(iatom, 0, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, 0, 0, cgblock); + } } /* ---------------------------------------------------------------------- @@ -873,7 +876,7 @@ typename SNAKokkos::complex SNAKokkos -KOKKOS_INLINE_FUNCTION +template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_bi(const int& iatom, const int& jjb) const { // for j1 = 0,...,twojmax @@ -892,17 +895,21 @@ void SNAKokkos::compute_bi(const int& iato const int jjz = idxz_block(j1,j2,j); const int jju = idxu_block[j]; - int itriple = 0; - int idouble = 0; - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - for (int elem3 = 0; elem3 < nelements; elem3++) { - blist(iatom, itriple, jjb) = evaluate_bi(j, jjz, jju, iatom, elem1, elem2, elem3); - itriple++; - } // end loop over elem3 - idouble++; - } // end loop over elem2 - } // end loop over elem1 + if constexpr (chemsnap) { + int itriple = 0; + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + for (int elem3 = 0; elem3 < nelements; elem3++) { + blist(iatom, itriple, jjb) = evaluate_bi(j, jjz, jju, iatom, elem1, elem2, elem3); + itriple++; + } // end loop over elem3 + idouble++; + } // end loop over elem2 + } // end loop over elem1 + } else { + blist(iatom, 0, jjb) = evaluate_bi(j, jjz, jju, iatom, 0, 0, 0); + } } /* ---------------------------------------------------------------------- From 261abaa6839ea22c880e6169a37f02465f3739c1 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 21 Nov 2024 14:17:40 -0800 Subject: [PATCH 27/30] Added templating over chemsnap for ComputeYi and ComputeYiWithZlist --- src/KOKKOS/pair_snap_kokkos.h | 28 +++---- src/KOKKOS/pair_snap_kokkos_impl.h | 54 ++++++++------ src/KOKKOS/sna_kokkos.h | 4 +- src/KOKKOS/sna_kokkos_impl.h | 114 ++++++++++++++++++----------- 4 files changed, 118 
insertions(+), 82 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index c42d7cdb88..660503eed8 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -48,8 +48,8 @@ template struct TagPairSNAPComputeZi{}; template struct TagPairSNAPComputeBi{}; struct TagPairSNAPComputeBetaLinear{}; struct TagPairSNAPComputeBetaQuadratic{}; -struct TagPairSNAPComputeYi{}; -struct TagPairSNAPComputeYiWithZlist{}; +template struct TagPairSNAPComputeYi{}; +template struct TagPairSNAPComputeYiWithZlist{}; template struct TagPairSNAPComputeForce{}; @@ -258,23 +258,23 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi, const int& iatom, const int& idxz) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi, const int& iatom, const int& idxz) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi, const int& iatom) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi, const int& iatom) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom_mod, const int& idxz, const int& iatom_div) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom, const int& idxz) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom, const int& idxz) const; - 
KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom) const; template KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index dfdee2e1c0..810d30e3ce 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -332,11 +332,21 @@ void PairSNAPKokkos::compute(int eflag_in, //Note zeroing `ylist` is fused into `TransformUi`. if (quadraticflag || eflag) { - auto policy_compute_yi = snap_get_policy(chunk_size_div, snaKK.idxz_max); - Kokkos::parallel_for("ComputeYiWithZlist", policy_compute_yi, *this); + if (nelements > 1) { + auto policy_compute_yi = snap_get_policy>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYiWithZlistChemsnap", policy_compute_yi, *this); + } else { + auto policy_compute_yi = snap_get_policy>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYiWithZlist", policy_compute_yi, *this); + } } else { - auto policy_compute_yi = snap_get_policy(chunk_size_div, snaKK.idxz_max); - Kokkos::parallel_for("ComputeYi", policy_compute_yi, *this); + if (nelements > 1) { + auto policy_compute_yi = snap_get_policy, min_blocks_compute_yi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYiChemsnap", policy_compute_yi, *this); + } else { + auto policy_compute_yi = snap_get_policy, min_blocks_compute_yi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYi", policy_compute_yi, *this); + } } } @@ -1041,27 +1051,27 @@ void PairSNAPKokkos::operator() (TagPairSN ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, 
const int& iatom_mod, const int& jjz, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjz >= snaKK.idxz_max) return; - snaKK.template compute_yi(iatom, jjz); + snaKK.template compute_yi(iatom, jjz); } template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom, const int& jjz) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom, const int& jjz) const { if (iatom >= chunk_size) return; - snaKK.template compute_yi(iatom, jjz); + snaKK.template compute_yi(iatom, jjz); } template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom) const { if (iatom >= chunk_size) return; for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) - snaKK.template compute_yi(iatom, jjz); + snaKK.template compute_yi(iatom, jjz); } /* ---------------------------------------------------------------------- @@ -1070,27 +1080,27 @@ void PairSNAPKokkos::operator() (TagPairSN ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom_mod, const int& jjz, const int& iatom_div) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom_mod, const int& jjz, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (jjz >= snaKK.idxz_max) return; - snaKK.template compute_yi_with_zlist(iatom, jjz); + snaKK.template compute_yi_with_zlist(iatom, jjz); } template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom, const int& jjz) const { +template 
KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom, const int& jjz) const { if (iatom >= chunk_size) return; - snaKK.template compute_yi_with_zlist(iatom, jjz); + snaKK.template compute_yi_with_zlist(iatom, jjz); } template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom) const { +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom) const { if (iatom >= chunk_size) return; for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) - snaKK.template compute_yi_with_zlist(iatom, jjz); + snaKK.template compute_yi_with_zlist(iatom, jjz); } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index ee1eb263dc..4247a79504 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -206,9 +206,9 @@ class SNAKokkos { template KOKKOS_INLINE_FUNCTION void compute_zi(const int&, const int&) const; // ForceSNAP - template KOKKOS_INLINE_FUNCTION + template KOKKOS_INLINE_FUNCTION void compute_yi(const int&, const int&) const; // ForceSNAP - template KOKKOS_INLINE_FUNCTION + template KOKKOS_INLINE_FUNCTION void compute_yi_with_zlist(const int&, const int&) const; // ForceSNAP template KOKKOS_INLINE_FUNCTION void compute_bi(const int&, const int&) const; // ForceSNAP diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 2d567759ea..4c11b1213e 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -1067,7 +1067,7 @@ void SNAKokkos::compute_beta_quadratic(con ------------------------------------------------------------------------- */ template -template +template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi(const int& iatom, const int& jjz) const { @@ -1078,31 +1078,44 @@ void SNAKokkos::compute_yi(const int& iato //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; //int ma = (2 
* (ma1min+ma2max) - j1 - j2 + j) / 2; - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { + if constexpr (chemsnap) { + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { - const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); + const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); - if constexpr (need_atomics) { - Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); - } else { - ylist_re(iatom, elem3, jju_half) += betaj * ztmp.re; - ylist_im(iatom, elem3, jju_half) += betaj * ztmp.im; - } - } // end loop over elem3 - } // end loop over elem2 - } // end loop over elem1 + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, elem3, jju_half) += betaj * ztmp.re; + ylist_im(iatom, elem3, jju_half) += betaj * ztmp.im; + } + } // end loop over elem3 + } // end loop over elem2 + } 
// end loop over elem1 + } else { + const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, 0, 0, cgblock); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, 0, 0, 0); + + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, 0, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, 0, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, 0, jju_half) += betaj * ztmp.re; + ylist_im(iatom, 0, jju_half) += betaj * ztmp.im; + } + } } /* ---------------------------------------------------------------------- @@ -1110,37 +1123,50 @@ void SNAKokkos::compute_yi(const int& iato ------------------------------------------------------------------------- */ template -template +template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi_with_zlist(const int& iatom, const int& jjz) const { int j1, j2, j, jju_half; idxz(jjz).get_yi_with_zlist(j1, j2, j, jju_half); - int idouble = 0; - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - const complex ztmp = zlist(iatom, idouble, jjz); - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { + if constexpr (chemsnap) { + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + const complex ztmp = zlist(iatom, idouble, jjz); + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); + const real_type betaj = 
evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); - if constexpr (need_atomics) { - Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); - } else { - ylist_re(iatom, elem3, jju_half) += betaj * ztmp.re; - ylist_im(iatom, elem3, jju_half) += betaj * ztmp.im; - } - } // end loop over elem3 - idouble++; - } // end loop over elem2 - } // end loop over elem1 + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, elem3, jju_half) += betaj * ztmp.re; + ylist_im(iatom, elem3, jju_half) += betaj * ztmp.im; + } + } // end loop over elem3 + idouble++; + } // end loop over elem2 + } // end loop over elem1 + } else { + const complex ztmp = zlist(iatom, 0, jjz); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, 0, 0, 0); + + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, 0, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, 0, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, 0, jju_half) += betaj * ztmp.re; + ylist_im(iatom, 0, jju_half) += betaj * ztmp.im; + } + } } /* ---------------------------------------------------------------------- From ce6e0dbe68611bd20ce5ab4bcce73725a69e4c20 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Thu, 21 Nov 2024 14:33:30 -0800 Subject: [PATCH 28/30] Added templating over chemsnap for evaluate_beta_scaled --- src/KOKKOS/sna_kokkos.h | 2 +- src/KOKKOS/sna_kokkos_impl.h | 59 +++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 4247a79504..a438ccd25e 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -241,7 +241,7 @@ class SNAKokkos { real_type evaluate_bi(const int&, const int&, const int&, const 
int&, const int&, const int&, const int&) const; // plugged into compute_yi, compute_yi_with_zlist - KOKKOS_FORCEINLINE_FUNCTION + template KOKKOS_FORCEINLINE_FUNCTION real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&) const; // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large KOKKOS_FORCEINLINE_FUNCTION diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 4c11b1213e..9a97f229b5 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -1092,7 +1092,7 @@ void SNAKokkos::compute_yi(const int& iato // pick out right beta value for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); if constexpr (need_atomics) { Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); @@ -1106,7 +1106,7 @@ void SNAKokkos::compute_yi(const int& iato } // end loop over elem1 } else { const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, 0, 0, cgblock); - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, 0, 0, 0); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, 0, 0, 0); if constexpr (need_atomics) { Kokkos::atomic_add(&(ylist_re(iatom, 0, jju_half)), betaj * ztmp.re); @@ -1142,7 +1142,7 @@ void SNAKokkos::compute_yi_with_zlist(cons // pick out right beta value for (int elem3 = 0; elem3 < nelements; elem3++) { - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); if constexpr (need_atomics) { Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); @@ -1157,7 +1157,7 @@ void SNAKokkos::compute_yi_with_zlist(cons } // end loop over elem1 } else { const complex ztmp = 
zlist(iatom, 0, jjz); - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, 0, 0, 0); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, 0, 0, 0); if constexpr (need_atomics) { Kokkos::atomic_add(&(ylist_re(iatom, 0, jju_half)), betaj * ztmp.re); @@ -1175,30 +1175,47 @@ void SNAKokkos::compute_yi_with_zlist(cons ------------------------------------------------------------------------- */ template -KOKKOS_FORCEINLINE_FUNCTION +template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::real_type SNAKokkos::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, const int& iatom, const int& elem1, const int& elem2, const int& elem3) const { - real_type betaj = 0; + int itriple_jjb = 0; + real_type factor = 0; - if (j >= j1) { - const int jjb = idxb_block(j1, j2, j); - const int itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; - if (j1 == j) { - if (j2 == j) betaj = static_cast(3) * d_beta(iatom, itriple); - else betaj = static_cast(2) * d_beta(iatom, itriple); - } else betaj = d_beta(iatom, itriple); - } else if (j >= j2) { - const int jjb = idxb_block(j, j2, j1); - const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = static_cast(2) * d_beta(iatom, itriple); - else betaj = d_beta(iatom, itriple); + if constexpr (chemsnap) { + if (j >= j1) { + itriple_jjb = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + idxb_block(j1, j2, j); + if (j1 == j) { + if (j2 == j) factor = 3; + else factor = 2; + } else factor = 1; + } else if (j >= j2) { + itriple_jjb = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + idxb_block(j, j2, j1); + if (j2 == j) factor = 2; + else factor = 1; + } else { + itriple_jjb = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + idxb_block(j2, j, j1); + factor = 1; + } } else { - const int jjb = idxb_block(j2, j, j1); - const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max 
+ jjb; - betaj = d_beta(iatom, itriple); + if (j >= j1) { + itriple_jjb = idxb_block(j1, j2, j); + if (j1 == j) { + if (j2 == j) factor = 3; + else factor = 2; + } else factor = 1; + } else if (j >= j2) { + itriple_jjb = idxb_block(j, j2, j1); + if (j2 == j) factor = 2; + else factor = 1; + } else { + itriple_jjb = idxb_block(j2, j, j1); + factor = 1; + } } + real_type betaj = factor * d_beta(iatom, itriple_jjb); + if (!bnorm_flag && j1 > j) { const real_type scale = static_cast(j1 + 1) / static_cast(j + 1); betaj *= scale; From e7c995abcbc857ed2d5db0640f93630926097de1 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 22 Nov 2024 11:28:25 -0800 Subject: [PATCH 29/30] Re-disabled SNAP Kokkos CPU path =( --- src/KOKKOS/pair_snap_kokkos_impl.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 810d30e3ce..bcece9626a 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -82,14 +82,14 @@ PairSNAPKokkos::~PairSNAPKokkos() template void PairSNAPKokkos::init_style() { - //if constexpr (host_flag) { - // if (lmp->kokkos->nthreads > 1) - // error->all(FLERR,"Pair style snap/kk can currently only run on a single " - // "CPU thread"); + if constexpr (host_flag) { + if (lmp->kokkos->nthreads > 1) + error->all(FLERR,"Pair style snap/kk can currently only run on a single " + "CPU thread"); - // PairSNAP::init_style(); - // return; - //} + PairSNAP::init_style(); + return; + } if (force->newton_pair == 0) error->all(FLERR,"Pair style SNAP requires newton pair on"); @@ -131,12 +131,12 @@ struct FindMaxNumNeighs { template void PairSNAPKokkos::compute(int eflag_in, int vflag_in) { - //if constexpr (host_flag) { - // atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); - // PairSNAP::compute(eflag_in,vflag_in); - // atomKK->modified(Host,F_MASK); - // return; - //} + if constexpr (host_flag) { + 
atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); + PairSNAP::compute(eflag_in,vflag_in); + atomKK->modified(Host,F_MASK); + return; + } eflag = eflag_in; vflag = vflag_in; From 8daad30ae1e64c6465f2abfcf44e653d928b61c7 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 16 Dec 2024 11:07:47 -0700 Subject: [PATCH 30/30] Re-enable Kokkos OpenMP and Serial code paths for SNAP --- src/KOKKOS/pair_snap_kokkos_impl.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index bcece9626a..2b9b862645 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -82,15 +82,6 @@ PairSNAPKokkos::~PairSNAPKokkos() template void PairSNAPKokkos::init_style() { - if constexpr (host_flag) { - if (lmp->kokkos->nthreads > 1) - error->all(FLERR,"Pair style snap/kk can currently only run on a single " - "CPU thread"); - - PairSNAP::init_style(); - return; - } - if (force->newton_pair == 0) error->all(FLERR,"Pair style SNAP requires newton pair on"); @@ -131,13 +122,6 @@ struct FindMaxNumNeighs { template void PairSNAPKokkos::compute(int eflag_in, int vflag_in) { - if constexpr (host_flag) { - atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); - PairSNAP::compute(eflag_in,vflag_in); - atomKK->modified(Host,F_MASK); - return; - } - eflag = eflag_in; vflag = vflag_in;