From befd4c8bfdd309029f81fd517d7df3181098e991 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 19 Jun 2020 14:12:29 -0400 Subject: [PATCH 1/6] Optimizations to Compute[Yi/Zi/Bi], switching over to an AoSoA data layout on the GPU. CPU vs GPU code paths are now maximally divergent, will include some discussion of that in PR. Small performance tweaks in Compute[UiTot/FusedDeidrj]. --- src/KOKKOS/pair_snap_kokkos.h | 63 +- src/KOKKOS/pair_snap_kokkos_impl.h | 522 +++++++++++---- src/KOKKOS/sna_kokkos.h | 64 +- src/KOKKOS/sna_kokkos_impl.h | 978 +++++++++++++++++++---------- 4 files changed, 1144 insertions(+), 483 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index d6b028640d..a4e15fe5ce 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -30,19 +30,31 @@ PairStyle(snap/kk/host,PairSNAPKokkos) namespace LAMMPS_NS { +// Routines for both the CPU and GPU backend template struct TagPairSNAPComputeForce{}; - -struct TagPairSNAPBeta{}; struct TagPairSNAPComputeNeigh{}; + +// GPU backend only struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUi{}; -struct TagPairSNAPComputeUiCPU{}; +struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; +struct TagPairSNAPBeta{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPZeroYi{}; +struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS struct TagPairSNAPComputeYi{}; +struct TagPairSNAPTransformYi{}; // re-order ylist from AoSoA to AoS struct TagPairSNAPComputeFusedDeidrj{}; + +// CPU backend only +struct TagPairSNAPPreUiCPU{}; +struct TagPairSNAPComputeUiCPU{}; +struct TagPairSNAPComputeZiCPU{}; +struct TagPairSNAPBetaCPU{}; +struct TagPairSNAPComputeBiCPU{}; +struct TagPairSNAPZeroYiCPU{}; +struct TagPairSNAPComputeYiCPU{}; struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDeidrjCPU{}; @@ -81,6 +93,10 @@ public: KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPBetaCPU,const int& ii) const; + + // GPU backend only KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -88,32 +104,54 @@ public: void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi,const int& ii) const; + void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPBeta, const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi,const int& ii) const; + void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, 
const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const; + // CPU backend only + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeZiCPU,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiCPU,const int& ii) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPBeta,const int& ii) const; - template KOKKOS_INLINE_FUNCTION void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, @@ -175,6 +213,7 @@ inline double dist2(double* x,double* y); Kokkos::View d_map; // mapping from atom types to elements Kokkos::View d_ninside; // ninside for all atoms in list Kokkos::View d_beta; // betas for all atoms in list + Kokkos::View d_beta_pack; // betas for all atoms in list, GPU Kokkos::View d_bispectrum; // bispectrum components for all atoms in list typedef Kokkos::DualView tdual_fparams; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 5dbf8e3a5a..64831e91cf 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -194,6 +194,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) if (beta_max < inum) { beta_max = inum; d_beta = Kokkos::View("PairSNAPKokkos:beta",ncoeff,inum); + if (!host_flag) + d_beta_pack = Kokkos::View("PairSNAPKokkos:beta_pack",32,ncoeff,(inum+32-1)/32); d_ninside = Kokkos::View("PairSNAPKokkos:ninside",inum); } @@ -222,32 +224,95 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); } - //PreUi + if (host_flag) { - int vector_length = vector_length_default; - int team_size = team_size_default; - if (!host_flag) - vector_length = 32; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length); - Kokkos::parallel_for("PreUi",policy_preui,*this); - } + // Host codepath - // ComputeUI - { - int vector_length = vector_length_default; - int team_size = team_size_default; - if (host_flag) { // CPU - // Run a fused calculation of ulist and accumulation into ulisttot using atomics + //PreUi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); + Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); + } + // ComputeUi + { + int vector_length = vector_length_default; + int team_size = 
team_size_default; + // Fused calculation of ulist and accumulation into ulisttot using atomics typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); + } - } else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics + //Compute bispectrum + if (quadraticflag || eflag) { + //ComputeZi + int idxz_max = snaKK.idxz_max; + typename Kokkos::RangePolicy policy_zi_cpu(0,chunk_size*idxz_max); + Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); - vector_length = 32; - team_size = 4; // need to cap b/c of shared memory reqs + //ComputeBi + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_bi_cpu(chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); + } + + + //ZeroYi,ComputeYi + { + + int vector_length = vector_length_default; + int team_size = team_size_default; + + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this); + + //ZeroYi + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); + Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this); + + //ComputeYi + int idxz_max = snaKK.idxz_max; + typename Kokkos::RangePolicy policy_yi_cpu(0,chunk_size*idxz_max); + Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this); + } // host flag + + //ComputeDuidrj and Deidrj + { + int team_size = team_size_default; + int vector_length = vector_length_default; + + typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + snaKK.set_dir(-1); // technically doesn't do anything + Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); + + typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); + } + } else { // GPU + +#ifdef KOKKOS_ENABLE_CUDA + //PreUi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length); + Kokkos::parallel_for("PreUi",policy_preui,*this); + } + + // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot + { + + int vector_length = 32; + int team_size = 4; // need to cap b/c of shared memory reqs check_team_size_for(chunk_size,team_size,vector_length); // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values @@ -265,62 +330,54 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeUi",policy_ui,*this); + //Transform data layout of ulisttot to AoSoA, zero ylist + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + } - } + //Compute bispectrum in AoSoA data layout, transform Bi + if (quadraticflag || eflag) { + //ComputeZi + int idxz_max = snaKK.idxz_max; + 
typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); - //Compute bispectrum - if (quadraticflag || eflag) { - //ComputeZi - int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_zi(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeZi",policy_zi,*this); + //ComputeBi + int idxb_max = snaKK.idxb_max; + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); - //ComputeBi - int vector_length = vector_length_default; - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_bi(chunk_size,team_size,vector_length); - Kokkos::parallel_for("ComputeBi",policy_bi,*this); - } + //Transform data layout of blist out of AoSoA + //We need this b/c `blist` gets used in ComputeForce which doesn't + //take advantage of AoSoA (which at best would only be beneficial + //on the margins) + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + } - //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); - Kokkos::parallel_for("ComputeBeta",policy_beta,*this); + //ComputeYi in AoSoA data layout, transform to AoS for ComputeFusedDeidrj + //Note zeroing `ylist` is fused into `TransformUi`. 
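      // (Illustrative aside, restating the index math that the Rank-3 MDRange
      //  launches above and below assume; the shorthand names here are not code
      //  from this patch. Each atom in the chunk is addressed by the pair
      //    iatom_mod = iatom % 32;   // fast, unit-stride lane within a 32-wide pack
      //    iatom_div = iatom / 32;   // slow index selecting the pack
      //  so iatom = iatom_mod + 32 * iatom_div, and the packed AoSoA views are
      //  indexed as, e.g., ulisttot_pack(iatom_mod, idxu, ielem, iatom_div).
      //  The (chunk_size + 32 - 1) / 32 launch bound is the number of packs; the
      //  `if (iatom >= chunk_size) return;` guard in each operator() discards the
      //  padding lanes of the final pack.)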
+ { + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBeta",policy_beta,*this); - //ZeroYi - { - int vector_length = vector_length_default; - int team_size = team_size_default; - if (!host_flag) - team_size = 128; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); - Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this); - } + //ComputeYi + const int idxz_max = snaKK.idxz_max; + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); - //ComputeYi - int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_yi(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeYi",policy_yi,*this); + //Transform data layout of ylist out of AoSoA + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("TransformYi",policy_transform_yi,*this); - //ComputeDuidrj and Deidrj - { - int team_size = team_size_default; - int vector_length = vector_length_default; - if (host_flag) { // CPU + } - typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - snaKK.set_dir(-1); // technically doesn't do anything - Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - - typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - - Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); - } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei - - vector_length = 32; - team_size = 2; // need to cap b/c of shared memory reqs + // Fused ComputeDuidrj, ComputeDeidrj + { + int vector_length = 32; + int team_size = 2; // need to cap b/c of shared memory reqs check_team_size_for(chunk_size,team_size,vector_length); // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry @@ -341,6 +398,9 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this); } } + +#endif // KOKKOS_ENABLE_CUDA + } //ComputeForce @@ -416,38 +476,6 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) } } -/* ---------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { - - const int i = d_ilist[ii + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - SNAKokkos my_sna = snaKK; - - Kokkos::View> - d_coeffi(d_coeffelem,ielem,Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) - d_beta(icoeff,ii) = d_coeffi[icoeff+1]; - - if (quadraticflag) { - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - double bveci = my_sna.blist(icoeff,ii); - d_beta(icoeff,ii) += d_coeffi[k]*bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - double bvecj = my_sna.blist(jcoeff,ii); - d_beta(icoeff,ii) += d_coeffi[k]*bvecj; - d_beta(jcoeff,ii) += d_coeffi[k]*bveci; - k++; - } - } - } -} /* 
---------------------------------------------------------------------- allocate all arrays @@ -520,6 +548,10 @@ void PairSNAPKokkos::coeff(int narg, char **arg) snaKK.init(); } +/* ---------------------------------------------------------------------- + Begin routines that are called on both CPU and GPU codepaths +------------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- */ template @@ -594,6 +626,53 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typen }); } +/* ---------------------------------------------------------------------- + Begin routines that are unique to the GPU codepath. These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { + + if (ii >= chunk_size) return; + + const int iatom_mod = ii % 32; + const int iatom_div = ii / 32; + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + SNAKokkos my_sna = snaKK; + + Kokkos::View> + d_coeffi(d_coeffelem,ielem,Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1]; + } + + if (quadraticflag) { + const auto idxb_max = my_sna.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + double bveci = my_sna.blist(idxb, idx_chem, ii); + d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + double bvecj = my_sna.blist(jdxb, jdx_chem, ii); + d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj; + d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci; + k++; + } + } + } +} + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -627,61 +706,107 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { SNAKokkos my_sna = snaKK; - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; + if (idxu >= my_sna.idxu_max) return; + + int elem_count = chemflag ? 
nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++) { + + const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom); + const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom); + + my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + + my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.; + my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.; + } - my_sna.compute_ui_cpu(team,ii,jj); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { SNAKokkos my_sna = snaKK; - // Extract the quantum number - const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); - if (idx >= my_sna.idxu_max) return; + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; - // Extract the atomic index - const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); - if (ii >= chunk_size) return; + if (jjz >= my_sna.idxz_max) return; - if (chemflag) - for(int ielem = 0; ielem < nelements; ielem++) - my_sna.zero_yi(idx,ii,ielem); - else - my_sna.zero_yi(idx,ii,0); + my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const { SNAKokkos my_sna = snaKK; - my_sna.compute_yi(ii,d_beta); + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (idxu >= my_sna.idxu_max) return; + + int elem_count = chemflag ? 
nelements : 1; + for (int ielem = 0; ielem < elem_count; ielem++) { + const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div); + const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div); + + my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im }; + } + } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { SNAKokkos my_sna = snaKK; - my_sna.compute_zi(ii); + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (jjz >= my_sna.idxz_max) return; + + my_sna.compute_zi(iatom_mod,jjz,iatom_div); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { SNAKokkos my_sna = snaKK; - my_sna.compute_bi(team,ii); + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (jjb >= my_sna.idxb_max) return; + + my_sna.compute_bi(iatom_mod,jjb,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (idxb >= my_sna.idxb_max) return; + + const int ntriples = my_sna.ntriples; + + for (int itriple = 0; itriple < ntriples; itriple++) { + + const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); + + my_sna.blist(idxb, itriple, iatom) = blocal; + } + } template @@ -701,6 +826,126 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrj,const my_sna.compute_fused_deidrj(team,ii,jj); } +/* ---------------------------------------------------------------------- + Begin routines that are unique to the CPU codepath. These do not take + advantage of AoSoA data layouts, but that could be a good point of + future optimization and unification with the above kernels. It's unlikely + that scratch memory optimizations will ever be useful for the CPU due to + different arithmetic intensity requirements for the CPU vs GPU. 
+------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPBetaCPU,const int& ii) const { + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + SNAKokkos my_sna = snaKK; + + Kokkos::View> + d_coeffi(d_coeffelem,ielem,Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + d_beta(icoeff,ii) = d_coeffi[icoeff+1]; + + if (quadraticflag) { + const auto idxb_max = my_sna.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + double bveci = my_sna.blist(idxb,idx_chem,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + double bvecj = my_sna.blist(jdxb,jdx_chem,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bvecj; + d_beta(jcoeff,ii) += d_coeffi[k]*bveci; + k++; + } + } + } +} + + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the atom number + const int ii = team.team_rank() + team.team_size() * team.league_rank(); + if (ii >= chunk_size) return; + int itype = type(ii); + int ielem = d_map[itype]; + + my_sna.pre_ui_cpu(team,ii,ielem); +} + + + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_cpu(team,ii,jj); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the quantum number + const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); + if (idx >= my_sna.idxu_max) return; + + // Extract the atomic index + const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); + if (ii >= chunk_size) return; + + if (chemflag) + for(int ielem = 0; ielem < nelements; ielem++) + my_sna.zero_yi_cpu(idx,ii,ielem); + else + my_sna.zero_yi_cpu(idx,ii,0); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { + SNAKokkos my_sna = snaKK; + my_sna.compute_yi_cpu(ii,d_beta); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU,const int& ii) const { + SNAKokkos my_sna = snaKK; + my_sna.compute_zi_cpu(ii); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + int ii = team.league_rank(); + SNAKokkos my_sna = snaKK; + my_sna.compute_bi_cpu(team,ii); +} + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename 
Kokkos::TeamPolicy::member_type& team) const { @@ -735,6 +980,12 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const t my_sna.compute_deidrj_cpu(team,ii,jj); } +/* ---------------------------------------------------------------------- + Also used for both CPU and GPU codepaths. Could maybe benefit from a + separate GPU/CPU codepath, but this kernel takes so little time it's + likely not worth it. +------------------------------------------------------------------------- */ + template template KOKKOS_INLINE_FUNCTION @@ -799,20 +1050,31 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeForce t_sna_2d; typedef Kokkos::View t_sna_2d_ll; typedef Kokkos::View t_sna_3d; + typedef Kokkos::View t_sna_3d_ll; typedef Kokkos::View t_sna_4d; + typedef Kokkos::View t_sna_4d_ll; typedef Kokkos::View t_sna_3d3; typedef Kokkos::View t_sna_5d; @@ -48,7 +50,8 @@ public: typedef Kokkos::View t_sna_3c; typedef Kokkos::View t_sna_3c_ll; typedef Kokkos::View t_sna_4c; - typedef Kokkos::View t_sna_4c_ll; + typedef Kokkos::View t_sna_4c3_ll; + typedef Kokkos::View t_sna_4c_ll; typedef Kokkos::View t_sna_3c3; typedef Kokkos::View t_sna_5c; @@ -73,27 +76,39 @@ inline int ncoeff; - // functions for bispectrum coefficients + // functions for bispectrum coefficients, GPU only KOKKOS_INLINE_FUNCTION - void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&, int); // ForceSNAP + void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); // ForceSNAP KOKKOS_INLINE_FUNCTION + void compute_zi(const int&, const int&, const int&); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_yi(int,int,int, + const Kokkos::View &beta_pack); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_bi(const int&, const int&, const int&); // ForceSNAP + + // functions for bispectrum coefficients, CPU only + KOKKOS_INLINE_FUNCTION + void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP + KOKKOS_INLINE_FUNCTION void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_zi(const int&); // ForceSNAP + void compute_zi_cpu(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void zero_yi(const int&, const int&, int); // ForceSNAP + void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_yi(int, + void compute_yi_cpu(int, const Kokkos::View &beta); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_bi(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP - - // functions for derivatives + KOKKOS_INLINE_FUNCTION + void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + // functions for derivatives, GPU only KOKKOS_INLINE_FUNCTION void compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); //ForceSNAP + + // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP KOKKOS_INLINE_FUNCTION @@ -139,18 +154,32 @@ inline int twojmax, diagonalstyle; - t_sna_2d_ll blist; - t_sna_2c_ll ulisttot; - t_sna_2c_ll zlist; + t_sna_3d_ll blist; + t_sna_3c_ll ulisttot; + t_sna_3c_ll zlist; t_sna_3c_ll ulist; - t_sna_2c_ll ylist; - + t_sna_3c_ll ylist; + // derivatives of data - t_sna_4c_ll dulist; 
+ t_sna_4c3_ll dulist; + + // Modified structures for GPU backend + t_sna_3d_ll ulisttot_re; // split real, + t_sna_3d_ll ulisttot_im; // imag + t_sna_4c_ll ulisttot_pack; // AoSoA layout + t_sna_4c_ll zlist_pack; // AoSoA layout + t_sna_4d_ll blist_pack; + t_sna_4d_ll ylist_pack_re; // split real, + t_sna_4d_ll ylist_pack_im; // imag AoSoA layout int idxcg_max, idxu_max, idxz_max, idxb_max; + // Chem snap counts + int nelements; + int ndoubles; + int ntriples; + private: double rmin0, rfac0; @@ -212,9 +241,6 @@ inline // Chem snap flags int chem_flag; int bnorm_flag; - int nelements; - int ndoubles; - int ntriples; // Self-weight double wself; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index dae4a0c2a1..8f265751ed 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -237,56 +237,71 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) element = t_sna_2i("sna:rcutij",natom,nmax); dedr = t_sna_3d("sna:dedr",natom,nmax,3); - blist = t_sna_2d_ll("sna:blist",idxb_max*ntriples,natom); - //ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max); - ulisttot = t_sna_2c_ll("sna:ulisttot",idxu_max*nelements,natom); - - zlist = t_sna_2c_ll("sna:zlist",idxz_max*ndoubles,natom); - - //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); #ifdef KOKKOS_ENABLE_CUDA if (std::is_same::value) { // dummy allocation + ulisttot = t_sna_3c_ll("sna:ulisttot",1,1,1); + ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",idxu_max,nelements,natom); + ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",idxu_max,nelements,natom); + ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",32,idxu_max,nelements,(natom+32-1)/32); ulist = t_sna_3c_ll("sna:ulist",1,1,1); - dulist = t_sna_4c_ll("sna:dulist",1,1,1); + zlist = t_sna_3c_ll("sna:zlist",1,1,1); + zlist_pack = t_sna_4c_ll("sna:zlist_pack",32,idxz_max,ndoubles,(natom+32-1)/32); + blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom); + blist_pack = t_sna_4d_ll("sna:blist_pack",32,idxb_max,ntriples,(natom+32-1)/32); + ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom); + ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",32,idxu_max,nelements,(natom+32-1)/32); + ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",32,idxu_max,nelements,(natom+32-1)/32); + dulist = t_sna_4c3_ll("sna:dulist",1,1,1); } else { #endif + ulisttot = t_sna_3c_ll("sna:ulisttot",idxu_max,nelements,natom); + ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",1,1,1); + ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",1,1,1); + ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",1,1,1,1); ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); - dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); + zlist = t_sna_3c_ll("sna:zlist",idxz_max,ndoubles,natom); + zlist_pack = t_sna_4c_ll("sna:zlist_pack",1,1,1,1); + blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom); + blist_pack = t_sna_4d_ll("sna:blist_pack",1,1,1,1); + ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom); + ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",1,1,1,1); + ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",1,1,1,1); + dulist = t_sna_4c3_ll("sna:dulist",idxu_max,natom,nmax); + #ifdef KOKKOS_ENABLE_CUDA } #endif - - //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); - ylist = t_sna_2c_ll("sna:ylist",idxu_max*nelements,natom); } /* ---------------------------------------------------------------------- - * compute Ui by summing over neighbors j - * ------------------------------------------------------------------------- */ + * GPU routines + * 
----------------------------------------------------------------------*/ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, int ielem) +void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) { - for(int jelem = 0; jelem < nelements; jelem++) - for (int j = 0; j <= twojmax; j++) { - const int jju = idxu_block(j); + for (int jelem = 0; jelem < nelements; jelem++) { + for (int j = 0; j <= twojmax; j++) { + const int jju = idxu_block(j); - // Only diagonal elements get initialized - // for (int m = 0; m < (j+1)*(j+1); m++) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), - [&] (const int m) { + // Only diagonal elements get initialized + // for (int m = 0; m < (j+1)*(j+1); m++) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), + [&] (const int m) { - const int jjup = jju + m; + const int jjup = jju + m; - // if m is on the "diagonal", initialize it with the self energy. - // Otherwise zero it out - SNAcomplex init = {0., 0.}; - if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = {wself, 0.0}; } //need to map iatom to element + // if m is on the "diagonal", initialize it with the self energy. + // Otherwise zero it out + double re_part = 0.; + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; } - ulisttot(jelem*idxu_max+jjup, iatom) = init; - }); + ulisttot_re(jjup, jelem, iatom) = re_part; + ulisttot_im(jjup, jelem, iatom) = 0.; + }); + } } } @@ -322,7 +337,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); - //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.); + const SNAcomplex u_up2 = rootpq2*((ma > 0)?buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.)); + + // u_accum += conj(b) * u_up2 caconjxpy(b, u_up2, u_accum); // VMK recursion relation: grab contribution which is multiplied by a* const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); - //const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.); + const SNAcomplex u_up1 = rootpq1*((ma < j)?buf1[jjup_shared_idx]:SNAcomplex(0.,0.)); + + // u_accum += conj(a) * u_up1 caconjxpy(a, u_up1, u_accum); - //ulist(jju_index,iatom,jnbor) = u_accum; // back up into shared memory for next iter buf2[jju_shared_idx] = u_accum; - Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re); - Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im); + Kokkos::atomic_add(&(ulisttot_re(jju_index,jpos,iatom)), sfac * u_accum.re); + Kokkos::atomic_add(&(ulisttot_im(jju_index,jpos,iatom)), sfac * u_accum.im); // copy left side to right side with inversion symmetry VMK 4.4(2) // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) @@ -415,7 +428,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, const int& 
iatom_div) { - double rsq, r, x, y, z, z0, theta0; - - // utot(j,ma,mb) = 0 for all j,ma,ma - // utot(j,ma,ma) = 1 for all j,ma - // for j in neighbors of i: - // compute r0 = (x,y,z,z0) - // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb - - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - - theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); - // theta0 = (r - rmin0) * rscale0; - z0 = r / tan(theta0); - - compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); - add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), element(iatom, jnbor)); - -} - -/* ---------------------------------------------------------------------- - compute Zi by summing over products of Ui -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi(const int& iter) -{ - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; const int j1 = idxz(jjz, 0); const int j2 = idxz(jjz, 1); @@ -493,77 +474,161 @@ void SNAKokkos::compute_zi(const int& iter) const int na = idxz(jjz, 7); const int nb = idxz(jjz, 8); - const double *cgblock = cglist.data() + idxcg_block(j1, j2, j); + const double* cgblock = cglist.data() + idxcg_block(j1, j2, j); int idouble = 0; - for(int elem1 = 0; elem1 < nelements; elem1++) - for(int elem2 = 0; elem2 < nelements; elem2++) { - const int jjza = idouble*idxz_max + jjz; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + double ztmp_r = 0.; + double ztmp_i = 0.; - zlist(jjza, iatom).re = 0.0; - zlist(jjza, iatom).im = 0.0; + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; - int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min; - int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 + 1) + mb2max; - for (int ib = 0; ib < nb; ib++) { + #pragma unroll + for(int ib = 0; ib < nb; ib++) { - double suma1_r = 0.0; - double suma1_i = 0.0; + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min * (j2 + 1) + ma2max; - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).re - - ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).im + - ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia + #pragma unroll + for(int ia = 0; ia < na; ia++) { + const SNAcomplex utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div); + const SNAcomplex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div); + const auto cgcoeff_a = cgblock[icga]; + const auto cgcoeff_b = cgblock[icgb]; + ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); + ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - zlist(jjza, iatom).re += cgblock[icgb] * suma1_r; - zlist(jjza, iatom).im += cgblock[icgb] * suma1_i; + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; + if (bnorm_flag) { + ztmp_r /= (j + 1); + ztmp_i /= (j + 1); + } - } // end loop over 
ib - if (bnorm_flag) { - zlist(jjza, iatom).re /= (j+1); - zlist(jjza, iatom).im /= (j+1); - } - idouble++; + zlist_pack(iatom_mod,jjz,idouble,iatom_div) = { ztmp_r, ztmp_i }; + + idouble++; + } } } /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices + compute Bi by summing conj(Ui)*Zi ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::zero_yi(const int& idx, const int& iatom, int ielem) +void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) { - ylist(ielem*idxu_max+idx,iatom) = {0.0, 0.0}; + // for j1 = 0,...,twojmax + // for j2 = 0,twojmax + // for j = |j1-j2|,Min(twojmax,j1+j2),2 + // b(j1,j2,j) = 0 + // for mb = 0,...,jmid + // for ma = 0,...,j + // b(j1,j2,j) += + // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) + + const int j1 = idxb(jjb,0); + const int j2 = idxb(jjb,1); + const int j = idxb(jjb,2); + + const int jjz = idxz_block(j1,j2,j); + const int jju = idxu_block[j]; + + int itriple = 0; + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + const int jalloy = idouble; + + for (int elem3 = 0; elem3 < nelements; elem3++) { + const int jjballoy = itriple; + + double sumzu = 0.0; + double sumzu_temp = 0.0; + + for(int mb = 0; 2*mb < j; mb++) { + for(int ma = 0; ma <= j; ma++) { + const int jju_index = jju+mb*(j+1)+ma; + const int jjz_index = jjz+mb*(j+1)+ma; + if (2*mb == j) return; // I think we can remove this? + const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + } + } + sumzu += sumzu_temp; + + // For j even, special treatment for middle column + if (j%2 == 0) { + sumzu_temp = 0.; + + const int mb = j/2; + for(int ma = 0; ma < mb; ma++) { + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + + } + sumzu += sumzu_temp; + + const int ma = mb; + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div); + sumzu += 0.5 * (utot.re * zloc.re + utot.im * zloc.im); + } // end if jeven + + sumzu *= 2.0; + if (bzero_flag) { + if (!wselfall_flag) { + if (elem1 == elem2 && elem1 == elem3) { + sumzu -= bzero[j]; + } + } else { + sumzu -= bzero[j]; + } + } + blist_pack(iatom_mod, jjb, jjballoy, iatom_div) = sumzu; + //} // end loop over j + //} // end loop over j1, j2 + itriple++; + } // end loop over elem3 + idouble++; + } // end loop over elem2 + } // end loop over elem1 } + /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices + compute Yi from Ui without storing Zi, looping over zlist indices, + GPU version ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi(int iter, - const Kokkos::View &beta) +void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, + const 
Kokkos::View &beta_pack) { double betaj; - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; const int j1 = idxz(jjz, 0); const int j2 = idxz(jjz, 1); @@ -576,88 +641,84 @@ void SNAKokkos::compute_yi(int iter, const int nb = idxz(jjz, 8); const int jju = idxz(jjz, 9); - const double *cgblock = cglist.data() + idxcg_block(j1, j2, j); + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; - int itriple; - for(int elem1 = 0; elem1 < nelements; elem1++) + for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - double ztmp_r = 0.0; - double ztmp_i = 0.0; + double ztmp_r = 0.0; + double ztmp_i = 0.0; - int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min; - int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 + 1) + mb2max; - for (int ib = 0; ib < nb; ib++) { + int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; + int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; + int icgb = mb1min * (j2 + 1) + mb2max; - double suma1_r = 0.0; - double suma1_i = 0.0; + #pragma unroll + for (int ib = 0; ib < nb; ib++) { - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min * (j2 + 1) + ma2max; + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * - ulisttot(jju2 + ma2, iatom).re - - ulisttot(jju1 + ma1, iatom).im * - ulisttot(jju2 + ma2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * - ulisttot(jju2 + ma2, iatom).im + - ulisttot(jju1 + ma1, iatom).im * - ulisttot(jju2 + ma2, iatom).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia + #pragma unroll + for (int ia = 0; ia < na; ia++) { + const SNAcomplex utot1 = ulisttot_pack(iatom_mod,jju1+ma1,elem1,iatom_div); + const SNAcomplex utot2 = ulisttot_pack(iatom_mod,jju2+ma2,elem2,iatom_div); + const auto cgcoeff_a = cgblock[icga]; + const auto cgcoeff_b = cgblock[icgb]; + ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); + ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - ztmp_r += cgblock[icgb] * suma1_r; - ztmp_i += cgblock[icgb] * suma1_i; - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib - if (bnorm_flag) { - ztmp_i /= j + 1; - ztmp_r /= j + 1; - } - - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { - const int jjuy = elem3 * idxu_max + jju; - if (j >= j1) { - const int jjb = idxb_block(j1, j2, j); - itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; - if (j1 == j) { - if (j2 == j) betaj = 3 * beta(itriple, iatom); - else betaj = 2 * beta(itriple, iatom); - } else betaj = beta(itriple, iatom); - } else if (j >= j2) { - const int jjb = idxb_block(j, j2, j1); - itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = 2 * beta(itriple, iatom); - else betaj = beta(itriple, iatom); - } else { - const int jjb = idxb_block(j2, j, j1); - itriple = ((elem2 * nelements + elem3) * 
nelements + elem1) * idxb_max + jjb; - betaj = beta(itriple, iatom); + if (bnorm_flag) { + ztmp_r /= j + 1; + ztmp_i /= j + 1; } - if (!bnorm_flag && j1 > j) - betaj *= (j1 + 1) / (j + 1.0); + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 - Kokkos::atomic_add(&(ylist(jjuy, iatom).re), betaj * ztmp_r); - Kokkos::atomic_add(&(ylist(jjuy, iatom).im), betaj * ztmp_i); - } - } + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + if (j >= j1) { + const int jjb = idxb_block(j1, j2, j); + const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; + if (j1 == j) { + if (j2 == j) betaj = 3 * beta_pack(iatom_mod, itriple, iatom_div); + else betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div); + } else betaj = beta_pack(iatom_mod, itriple, iatom_div); + } else if (j >= j2) { + const int jjb = idxb_block(j, j2, j1); + const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; + if (j2 == j) betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div); + else betaj = beta_pack(iatom_mod, itriple, iatom_div); + } else { + const int jjb = idxb_block(j2, j, j1); + const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; + betaj = beta_pack(iatom_mod, itriple, iatom_div); + } + + if (!bnorm_flag && j1 > j) + betaj *= (j1 + 1) / (j + 1.0); + + + Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_i); + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 } /* ---------------------------------------------------------------------- @@ -673,7 +734,7 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli const int max_m_tile = (twojmax+1)*(twojmax/2+1); const int team_rank = team.team_rank(); const int scratch_shift = team_rank * max_m_tile; - const int jpos = element(iatom, jnbor)*idxu_max; + const int jelem = element(iatom, jnbor); // double buffer for ulist SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; @@ -723,14 +784,14 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // Accumulate the full contribution to dedr on the fly const double du_prod = dsfac * u; // chain rule - const SNAcomplex y_local = ylist(jpos, iatom); + const SNAcomplex y_local = ylist(0, jelem, iatom); // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0 double dedr_full_sum = 0.5 * du_prod * y_local.re; // single has a warp barrier at the end Kokkos::single(Kokkos::PerThread(team), [=]() { - //dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here + ulist_buf1[0] = {1., 0.}; dulist_buf1[0] = {0., 0.}; }); @@ -755,10 +816,11 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli [&] (const int m, double& sum_tmp) { // ma fast, mb slow - int ma = m % n_ma; - int mb = m / n_ma; + // Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive. 
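          // (Illustrative aside, not from this patch: for m >= 0 and n_ma > 0,
          //    const int mb = m / n_ma;
          //    const int ma = m - mb * n_ma;   // same value as m % n_ma
          //  reuses the quotient instead of requesting a separate modulo, which
          //  may otherwise compile to a second integer-division sequence on the GPU.)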
+ const int mb = m / n_ma; + const int ma = m - mb * n_ma; - const int jju_index = jpos+jju+m; + const int jju_index = jju+m; // Load y_local, apply the symmetry scaling factor // The "secret" of the shared memory optimization is it eliminates @@ -766,7 +828,7 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // shared memory and otherwise always writing, making the kernel // ultimately compute bound. We take advantage of that by adding // some reads back in. - auto y_local = ylist(jju_index,iatom); + auto y_local = ylist(jju_index, jelem, iatom); if (j % 2 == 0 && 2*mb == j) { if (ma == mb) { y_local = 0.5*y_local; } else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright @@ -783,29 +845,35 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli SNAcomplex du_accum = { 0., 0. }; const double rootpq2 = -rootpqarray(ma, j - mb); - const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); + const SNAcomplex u_up2 = rootpq2*((ma > 0) ? ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.)); + + // u_accum += conj(b) * u_up2 caconjxpy(b, u_up2, u_accum); const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.); + const SNAcomplex u_up1 = rootpq1*((ma < j) ? ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.)); + + // u_accum += conj(a) * u_up1 caconjxpy(a, u_up1, u_accum); // Next, spin up du_accum - const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.); + const SNAcomplex du_up1 = rootpq1*((ma < j) ? dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.)); + + // du_accum += conj(da) * u_up1 + conj(a) * du_up1 caconjxpy(da, u_up1, du_accum); caconjxpy(a, du_up1, du_accum); - const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.); + const SNAcomplex du_up2 = rootpq2*((ma > 0) ? dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.)); + + // du_accum += conj(db) * u_up2 + conj(b) * du_up2 caconjxpy(db, u_up2, du_accum); caconjxpy(b, du_up2, du_accum); - // No need to save u_accum to global memory // Cache u_accum, du_accum to scratch memory. ulist_buf2[jju_shared_idx] = u_accum; dulist_buf2[jju_shared_idx] = du_accum; // Directly accumulate deidrj into sum_tmp - //dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum); sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im; @@ -813,7 +881,7 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) if (j%2==1 && mb+1==n_mb) { int sign_factor = (((ma+mb)%2==0)?1:-1); - //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist + const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); if (sign_factor == 1) { @@ -826,8 +894,6 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // We don't need the second half of the tile for the deidrj accumulation. // That's taken care of by the symmetry factor above. 
- //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); - // We do need it for ortho polynomial generation, though ulist_buf2[jju_shared_flip] = u_accum; dulist_buf2[jju_shared_flip] = du_accum; @@ -854,69 +920,150 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli }); } -/* ---------------------------------------------------------------------- - compute dEidRj, CPU path only. -------------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + * CPU routines + * ----------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------- + * compute Ui by summing over neighbors j + * ------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) { - t_scalar3 final_sum; - const int jelem = element(iatom, jnbor); + for (int jelem = 0; jelem < nelements; jelem++) { + for (int j = 0; j <= twojmax; j++) { + const int jju = idxu_block(j); - //for(int j = 0; j <= twojmax; j++) { - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j, t_scalar3& sum_tmp) { - int jju = idxu_block[j]; - int jjuy = idxu_block[j] + jelem*idxu_max; + // Only diagonal elements get initialized + // for (int m = 0; m < (j+1)*(j+1); m++) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), + [&] (const int m) { - for(int mb = 0; 2*mb < j; mb++) - for(int ma = 0; ma <= j; ma++) { - sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im; - sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im; - sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im; - jju++; - jjuy++; - } //end loop over ma mb + const int jjup = jju + m; - // For j even, handle middle column + // if m is on the "diagonal", initialize it with the self energy. 
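        // (Illustrative aside, not from this patch: with the (j+1)x(j+1) tile stored
        //  as m = mb*(j+1) + ma, the diagonal entries ma == mb occur exactly at
        //  m = mb*(j+2), which is what the m % (j+2) == 0 test below detects.)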
+ // Otherwise zero it out + SNAcomplex init = {0., 0.}; + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = {wself, 0.0}; } //need to map iatom to element - if (j%2 == 0) { - - int mb = j/2; - for(int ma = 0; ma < mb; ma++) { - sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im; - sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im; - sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im; - jju++; - jjuy++; - } - - //int ma = mb; - sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im)*0.5; - sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im)*0.5; - sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im)*0.5; - } // end if jeven - - },final_sum); // end loop over j - - Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,0) = final_sum.x*2.0; - dedr(iatom,jnbor,1) = final_sum.y*2.0; - dedr(iatom,jnbor,2) = final_sum.z*2.0; - }); + ulisttot(jjup, jelem, iatom) = init; + }); + } + } } + /* ---------------------------------------------------------------------- - compute Bi by summing conj(Ui)*Zi + compute Ui by summing over bispectrum components. CPU only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +{ + double rsq, r, x, y, z, z0, theta0; + + // utot(j,ma,mb) = 0 for all j,ma,ma + // utot(j,ma,ma) = 1 for all j,ma + // for j in neighbors of i: + // compute r0 = (x,y,z,z0) + // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb + + x = rij(iatom,jnbor,0); + y = rij(iatom,jnbor,1); + z = rij(iatom,jnbor,2); + rsq = x * x + y * y + z * z; + r = sqrt(rsq); + + theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + // theta0 = (r - rmin0) * rscale0; + z0 = r / tan(theta0); + + compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); + add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), element(iatom, jnbor)); + +} +/* ---------------------------------------------------------------------- + compute Zi by summing over products of Ui, CPU version +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_zi_cpu(const int& iter) +{ + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + + int idouble = 0; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + zlist(jjz, idouble, iatom).re = 0.0; + zlist(jjz, idouble, iatom).im = 0.0; + + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + 
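      // Editorial note (schematic, not from the original patch): the ib/ia
      // double loop below carries out the Clebsch-Gordan contraction
      //
      //   z_{j1,j2,j}(ma,mb) = sum over (mb1,mb2) pairs of C_b(mb1,mb2) *
      //                          sum over (ma1,ma2) pairs of C_a(ma1,ma2) *
      //                            u_{j1}(ma1,mb1) * u_{j2}(ma2,mb2)
      //
      // where cgblock[icga] supplies the ma-side coefficient C_a,
      // cgblock[icgb] the mb-side coefficient C_b, and jju1/jju2 step through
      // the rows of the two ulisttot blocks selected by (elem1, elem2).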
for(int ib = 0; ib < nb; ib++) { + + double suma1_r = 0.0; + double suma1_i = 0.0; + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min * (j2 + 1) + ma2max; + for(int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re - + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im); + suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im + + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r; + zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i; + + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + zlist(jjz, idouble, iatom).re /= (j+1); + zlist(jjz, idouble, iatom).im /= (j+1); + } + idouble++; + } // end loop over elem2 + } // end loop over elem1 +} + + +/* ---------------------------------------------------------------------- + compute Bi by summing conj(Ui)*Zi, CPU version +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) { // for j1 = 0,...,twojmax // for j2 = 0,twojmax @@ -929,15 +1076,14 @@ void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy::compute_bi(const typename Kokkos::TeamPolicy +KOKKOS_INLINE_FUNCTION +void SNAKokkos::zero_yi_cpu(const int& idx, const int& iatom, const int& ielem) +{ + ylist(idx,ielem,iatom) = {0.0, 0.0}; +} + +/* ---------------------------------------------------------------------- + compute Yi from Ui without storing Zi, looping over zlist indices, + CPU version +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_yi_cpu(int iter, + const Kokkos::View &beta) +{ + double betaj; + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + const int jju = idxz(jjz, 9); + + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; + //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + + double ztmp_r = 0.0; + double ztmp_i = 0.0; + + int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; + int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; + int icgb = mb1min * (j2 +1 ) + mb2max; + + for (int ib = 0; ib < nb; ib++) { + + double suma1_r = 0.0; + double suma1_i = 0.0; + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + + for (int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re - + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im); + suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im + + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + ztmp_r += 
cgblock[icgb] * suma1_r; + ztmp_i += cgblock[icgb] * suma1_i; + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + ztmp_i /= j + 1; + ztmp_r /= j + 1; + } + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + + if (j >= j1) { + const int jjb = idxb_block(j1, j2, j); + const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; + if (j1 == j) { + if (j2 == j) betaj = 3 * beta(itriple, iatom); + else betaj = 2 * beta(itriple, iatom); + } else betaj = beta(itriple, iatom); + } else if (j >= j2) { + const int jjb = idxb_block(j, j2, j1); + const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; + if (j2 == j) betaj = 2 * beta(itriple, iatom); + else betaj = beta(itriple, iatom); + } else { + const int jjb = idxb_block(j2, j, j1); + const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; + betaj = beta(itriple, iatom); + } + + if (!bnorm_flag && j1 > j) + betaj *= (j1 + 1) / (j + 1.0); + + Kokkos::atomic_add(&(ylist(jju, elem3, iatom).re), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist(jju, elem3, iatom).im), betaj*ztmp_i); + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 +} + + /* ---------------------------------------------------------------------- calculate derivative of Ui w.r.t. atom j ------------------------------------------------------------------------- */ @@ -1042,6 +1303,61 @@ void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor)); } + +/* ---------------------------------------------------------------------- + compute dEidRj, CPU path only. 
+------------------------------------------------------------------------- */ + + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +{ + t_scalar3 final_sum; + const int jelem = element(iatom, jnbor); + + //for(int j = 0; j <= twojmax; j++) { + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), + [&] (const int& j, t_scalar3& sum_tmp) { + int jju = idxu_block[j]; + + for(int mb = 0; 2*mb < j; mb++) + for(int ma = 0; ma <= j; ma++) { + sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im; + sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im; + sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im; + jju++; + } //end loop over ma mb + + // For j even, handle middle column + + if (j%2 == 0) { + + int mb = j/2; + for(int ma = 0; ma < mb; ma++) { + sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im; + sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im; + sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im; + jju++; + } + + //int ma = mb; + sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im)*0.5; + sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im)*0.5; + sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im)*0.5; + } // end if jeven + + },final_sum); // end loop over j + + Kokkos::single(Kokkos::PerThread(team), [&] () { + dedr(iatom,jnbor,0) = final_sum.x*2.0; + dedr(iatom,jnbor,1) = final_sum.y*2.0; + dedr(iatom,jnbor,2) = final_sum.z*2.0; + }); + +} + + /* ---------------------------------------------------------------------- add Wigner U-functions for one neighbor to the total ------------------------------------------------------------------------- */ @@ -1052,10 +1368,11 @@ void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::compute_duarray_cpu(const typename Kokkos::TeamPolic double z0, double r, double dz0dr, double wj, double rcut) { - double r0inv; +double r0inv; double a_r, a_i, b_r, b_i; double da_r[3], da_i[3], db_r[3], db_i[3]; double dz0[3], dr0inv[3], dr0invdr; @@ -1710,6 +2027,7 @@ double SNAKokkos::memory_usage() { int jdimpq = twojmax + 2; int jdim = twojmax + 1; + int natom_pad = ((natom + 32 - 1) / 32) * 32; // for AoSoA layouts double bytes; bytes = 0; @@ -1717,21 +2035,38 @@ double SNAKokkos::memory_usage() bytes += jdimpq*jdimpq * sizeof(double); // pqarray bytes += idxcg_max * sizeof(double); // cglist + + #ifdef KOKKOS_ENABLE_CUDA - if (!std::is_same::value) { + if (std::is_same::value) { + + bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_re + bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_im + bytes += natom_pad * idxu_max * nelements * sizeof(double) * 2; // ulisttot_pack + + bytes += natom_pad * idxz_max * ndoubles * sizeof(double) * 2; // zlist_pack + bytes += natom_pad * idxb_max * ntriples * sizeof(double); // 
blist_pack + + bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_re + bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_im + bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist + } else { #endif - bytes += natom * idxu_max * sizeof(double) * 2; // ulist - bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist + + bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist + bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot + + bytes += natom * idxz_max * ndoubles * sizeof(double) * 2; // zlist + bytes += natom * idxb_max * ntriples * sizeof(double); // blist + + bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist + + bytes += natom * nmax * idxu_max * 3 * sizeof(double) * 2; // dulist #ifdef KOKKOS_ENABLE_CUDA } #endif - bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot - if (!std::is_same::value) - bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot_lr - bytes += natom * idxz_max * ndoubles * sizeof(double) * 2; // zlist - bytes += natom * idxb_max * ntriples * sizeof(double); // blist - bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist + bytes += natom * nmax * 3 * sizeof(double); // dedr bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block bytes += jdim * sizeof(int); // idxu_block @@ -1747,7 +2082,6 @@ double SNAKokkos::memory_usage() bytes += natom * nmax * sizeof(int); // inside bytes += natom * nmax * sizeof(double); // wj bytes += natom * nmax * sizeof(double); // rcutij - bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist_ij return bytes; } From 55d6f1a34f55b3d8365130b31fa1358c6c20baa5 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 19 Jun 2020 19:33:17 -0400 Subject: [PATCH 2/6] Fixed variable name to be more consistent with rest of code. 
--- src/KOKKOS/sna_kokkos_impl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 8f265751ed..866b4c516f 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -337,7 +337,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy Date: Mon, 22 Jun 2020 10:50:31 -0600 Subject: [PATCH 3/6] Whitespace tweak --- src/KOKKOS/pair_snap_kokkos_impl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 64831e91cf..6bedf5554e 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -261,10 +261,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); } - //ZeroYi,ComputeYi { - int vector_length = vector_length_default; int team_size = team_size_default; From 198258766d768cce97e560014630c68a3eb3620f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 22 Jun 2020 18:59:09 -0400 Subject: [PATCH 4/6] ensure that per-arrays are only freed if the last atom style instance is deleted --- src/atom_vec.cpp | 76 ++++++++++++++++++++++++++---------------------- src/atom_vec.h | 4 +++ 2 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp index 6a294d3ccf..5a86caa4fb 100644 --- a/src/atom_vec.cpp +++ b/src/atom_vec.cpp @@ -33,6 +33,8 @@ using namespace MathConst; #define DELTA 16384 #define DELTA_BONUS 8192 +int AtomVec::num_atom_vecs = 0; + /* ---------------------------------------------------------------------- */ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) @@ -54,6 +56,8 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) threads = NULL; + ++num_atom_vecs; + // peratom variables auto-included in corresponding child style fields string // these fields cannot be specified in the fields string @@ -93,44 +97,48 @@ AtomVec::~AtomVec() int datatype,cols; void *pdata; + --num_atom_vecs; + for (int i = 0; i < nargcopy; i++) delete [] argcopy[i]; delete [] argcopy; - memory->destroy(atom->tag); - memory->destroy(atom->type); - memory->destroy(atom->mask); - memory->destroy(atom->image); - memory->destroy(atom->x); - memory->destroy(atom->v); - memory->destroy(atom->f); + if (num_atom_vecs == 0) { + memory->destroy(atom->tag); + memory->destroy(atom->type); + memory->destroy(atom->mask); + memory->destroy(atom->image); + memory->destroy(atom->x); + memory->destroy(atom->v); + memory->destroy(atom->f); - for (int i = 0; i < ngrow; i++) { - pdata = mgrow.pdata[i]; - datatype = mgrow.datatype[i]; - cols = mgrow.cols[i]; - if (datatype == Atom::DOUBLE) { - if (cols == 0) - memory->destroy(*((double **) pdata)); - else if (cols > 0) - memory->destroy(*((double ***) pdata)); - else { - memory->destroy(*((double ***) pdata)); - } - } else if (datatype == Atom::INT) { - if (cols == 0) - memory->destroy(*((int **) pdata)); - else if (cols > 0) - memory->destroy(*((int ***) pdata)); - else { - memory->destroy(*((int ***) pdata)); - } - } else if (datatype == Atom::BIGINT) { - if (cols == 0) - memory->destroy(*((bigint **) pdata)); - else if (cols > 0) - memory->destroy(*((bigint ***) pdata)); - else { - memory->destroy(*((bigint ***) pdata)); + for (int i = 0; i < ngrow; i++) { + pdata = mgrow.pdata[i]; + datatype = mgrow.datatype[i]; + cols = 
mgrow.cols[i]; + if (datatype == Atom::DOUBLE) { + if (cols == 0) + memory->destroy(*((double **) pdata)); + else if (cols > 0) + memory->destroy(*((double ***) pdata)); + else { + memory->destroy(*((double ***) pdata)); + } + } else if (datatype == Atom::INT) { + if (cols == 0) + memory->destroy(*((int **) pdata)); + else if (cols > 0) + memory->destroy(*((int ***) pdata)); + else { + memory->destroy(*((int ***) pdata)); + } + } else if (datatype == Atom::BIGINT) { + if (cols == 0) + memory->destroy(*((bigint **) pdata)); + else if (cols > 0) + memory->destroy(*((bigint ***) pdata)); + else { + memory->destroy(*((bigint ***) pdata)); + } } } } diff --git a/src/atom_vec.h b/src/atom_vec.h index 8ccf922c4b..625307face 100644 --- a/src/atom_vec.h +++ b/src/atom_vec.h @@ -209,6 +209,10 @@ class AtomVec : protected Pointers { bool *threads; + // counter for atom vec instances + + static int num_atom_vecs; + // local methods void grow_nmax(); From afe6484c445db453135f296541fa50187940aa0b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 22 Jun 2020 19:15:28 -0400 Subject: [PATCH 5/6] Revert "reorganize memory (de-)allocation and fix substantial memory leak in MSM" This reverts commit f78671c1a4d81e5907e929c9341d784e03881bd6. --- src/KSPACE/msm.cpp | 195 +++++++++++++++++++++++---------------------- 1 file changed, 100 insertions(+), 95 deletions(-) diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index abc8ca1150..70a36984d7 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -28,8 +28,6 @@ #include "domain.h" #include "memory.h" #include "error.h" -#include "utils.h" -#include "fmt/format.h" #include "math_const.h" @@ -65,6 +63,9 @@ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), factors[0] = 2; MPI_Comm_rank(world,&me); + procneigh_levels = NULL; + world_levels = NULL; + active_flag = NULL; phi1d = dphi1d = NULL; @@ -80,7 +81,15 @@ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), v0_direct_top = v1_direct_top = v2_direct_top = NULL; v3_direct_top = v4_direct_top = v5_direct_top = NULL; + cg_all = cg_peratom_all = NULL; + cg = cg_peratom = NULL; + ngrid = NULL; + cg = NULL; + cg_peratom = NULL; + procneigh_levels = NULL; + world_levels = NULL; + active_flag = NULL; alpha = betax = betay = betaz = NULL; nx_msm = ny_msm = nz_msm = NULL; @@ -141,7 +150,10 @@ MSM::~MSM() void MSM::init() { - if (me == 0) utils::logmesg(lmp,"MSM initialization ...\n"); + if (me == 0) { + if (screen) fprintf(screen,"MSM initialization ...\n"); + if (logfile) fprintf(logfile,"MSM initialization ...\n"); + } // error check @@ -157,8 +169,13 @@ void MSM::init() if ((slabflag == 1) && (me == 0)) error->warning(FLERR,"Slab correction not needed for MSM"); - if ((order < 4) || (order > 10) || (order%2 != 0)) - error->all(FLERR,"MSM order must be 4, 6, 8, or 10"); + if (order < 4 || order > 10) { + char str[128]; + sprintf(str,"MSM order must be 4, 6, 8, or 10"); + error->all(FLERR,str); + } + + if (order%2 != 0) error->all(FLERR,"MSM order must be 4, 6, 8, or 10"); if (sizeof(FFT_SCALAR) != 8) error->all(FLERR,"Cannot (yet) use single precision with MSM " @@ -204,14 +221,33 @@ void MSM::init() MPI_Allreduce(&ngrid[0],&ngrid_max,1,MPI_INT,MPI_MAX,world); if (me == 0) { - std::string mesg = fmt::format(" 3d grid size/proc = {}\n", ngrid_max); - mesg += fmt::format(" estimated absolute RMS force accuracy = {}\n", - estimated_error); - mesg += fmt::format(" estimated relative force accuracy = {}\n", - estimated_error/two_charge_force); - mesg += fmt::format(" grid = {} {} {}\n",nx_msm[0],ny_msm[0],nz_msm[0]); - mesg += 
fmt::format(" order = {}\n",order); - utils::logmesg(lmp,mesg); + if (screen) { + fprintf(screen," 3d grid size/proc = %d\n", + ngrid_max); + fprintf(screen," estimated absolute RMS force accuracy = %g\n", + estimated_error); + fprintf(screen," estimated relative force accuracy = %g\n", + estimated_error/two_charge_force); + } + if (logfile) { + fprintf(logfile," 3d grid size/proc = %d\n", + ngrid_max); + fprintf(logfile," estimated absolute RMS force accuracy = %g\n", + estimated_error); + fprintf(logfile," estimated relative force accuracy = %g\n", + estimated_error/two_charge_force); + } + } + + if (me == 0) { + if (screen) { + fprintf(screen," grid = %d %d %d\n",nx_msm[0],ny_msm[0],nz_msm[0]); + fprintf(screen," order = %d\n",order); + } + if (logfile) { + fprintf(logfile," grid = %d %d %d\n",nx_msm[0],ny_msm[0],nz_msm[0]); + fprintf(logfile," order = %d\n",order); + } } } @@ -362,6 +398,7 @@ void MSM::setup() nmax_direct = 8*(nxhi_direct+1)*(nyhi_direct+1)*(nzhi_direct+1); + deallocate(); if (peratom_allocate_flag) deallocate_peratom(); // compute direct sum interaction weights @@ -612,8 +649,6 @@ void MSM::compute(int eflag, int vflag) void MSM::allocate() { - deallocate(); - // interpolation coeffs order_allocated = order; @@ -635,9 +670,9 @@ void MSM::allocate() // allocate memory for each grid level for (int n=0; ncreate3d_offset(qgrid[n],nzlo_out[n],nzhi_out[n], nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:qgrid"); + memory->create3d_offset(egrid[n],nzlo_out[n],nzhi_out[n], nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:egrid"); @@ -660,23 +695,24 @@ void MSM::allocate() void MSM::deallocate() { - delete cg_all; - cg_all = nullptr; - memory->destroy2d_offset(phi1d,-order_allocated); memory->destroy2d_offset(dphi1d,-order_allocated); + if (cg_all) delete cg_all; + for (int n=0; ndestroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); + if (qgrid[n]) + memory->destroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - if (world_levels[n] != MPI_COMM_NULL) - MPI_Comm_free(&world_levels[n]); - world_levels[n] = MPI_COMM_NULL; - active_flag[n] = 0; + if (egrid[n]) + memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - delete cg[n]; - cg[n] = nullptr; + if (world_levels) + if (world_levels[n] != MPI_COMM_NULL) + MPI_Comm_free(&world_levels[n]); + + if (cg) + if (cg[n]) delete cg[n]; } } @@ -765,7 +801,6 @@ void MSM::deallocate_peratom() void MSM::allocate_levels() { - deallocate_levels(); ngrid = new int[levels]; cg = new GridComm*[levels]; @@ -815,21 +850,21 @@ void MSM::allocate_levels() v5grid = new double***[levels]; for (int n=0; ndestroy(procneigh_levels); delete [] world_levels; @@ -886,50 +919,6 @@ void MSM::deallocate_levels() delete [] v3grid; delete [] v4grid; delete [] v5grid; - - world_levels = nullptr; - active_flag = nullptr; - cg = nullptr; - cg_peratom = nullptr; - - alpha = nullptr; - betax = nullptr; - betay = nullptr; - betaz = nullptr; - - nx_msm = nullptr; - ny_msm = nullptr; - nz_msm = nullptr; - - nxlo_in = nullptr; - nylo_in = nullptr; - nzlo_in = nullptr; - - nxhi_in = nullptr; - nyhi_in = nullptr; - nzhi_in = nullptr; - - nxlo_out = nullptr; - nylo_out = nullptr; - nzlo_out = nullptr; - - nxhi_out = nullptr; - nyhi_out = nullptr; - nzhi_out = nullptr; - - delxinv = nullptr; - delyinv = nullptr; - delzinv = nullptr; - - qgrid = nullptr; - egrid = nullptr; - - v0grid = nullptr; - v1grid = nullptr; - v2grid = nullptr; - v3grid = nullptr; - 
v4grid = nullptr; - v5grid = nullptr; } /* ---------------------------------------------------------------------- @@ -1064,9 +1053,9 @@ void MSM::set_grid_global() double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); *p_cutoff = cutoff; - if (me == 0) - error->warning(FLERR,fmt::format("Adjusting Coulombic cutoff for " - "MSM, new cutoff = {}", cutoff)); + char str[128]; + sprintf(str,"Adjusting Coulombic cutoff for MSM, new cutoff = %g",cutoff); + if (me == 0) error->warning(FLERR,str); } if (triclinic == 0) { @@ -1105,6 +1094,7 @@ void MSM::set_grid_global() if (!domain->nonperiodic) levels -= 1; + deallocate_levels(); allocate_levels(); // find number of grid levels in each direction @@ -1328,8 +1318,8 @@ void MSM::set_grid_local() void MSM::set_proc_grid(int n) { - for (int i=0; i<3; i++) - myloc[i] = comm->myloc[i]; + for (int i=0; i<3; i++) + myloc[i] = comm->myloc[i]; // size of inner MSM grid owned by this proc @@ -1372,7 +1362,6 @@ void MSM::set_proc_grid(int n) // define a new MPI communicator for this grid level that only includes active procs - if(world_levels[n] != MPI_COMM_NULL) MPI_Comm_free(&world_levels[n]); MPI_Comm_split(world,color,me,&world_levels[n]); if (!active_flag[n]) return; @@ -1490,13 +1479,16 @@ void MSM::particle_map() void MSM::make_rho() { + //fprintf(screen,"MSM aninterpolation\n\n"); + int i,l,m,n,nx,ny,nz,mx,my,mz; double dx,dy,dz,x0,y0,z0; // clear 3d density array - double ***qgrid0 = qgrid[0]; - memset(&(qgrid0[nzlo_out[0]][nylo_out[0]][nxlo_out[0]]),0,ngrid[0]*sizeof(double)); + double ***qgridn = qgrid[0]; + + memset(&(qgridn[nzlo_out[0]][nylo_out[0]][nxlo_out[0]]),0,ngrid[0]*sizeof(double)); // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -1527,7 +1519,7 @@ void MSM::make_rho() x0 = y0*phi1d[1][m]; for (l = nlower; l <= nupper; l++) { mx = l+nx; - qgrid0[mz][my][mx] += x0*phi1d[0][l]; + qgridn[mz][my][mx] += x0*phi1d[0][l]; } } } @@ -1542,6 +1534,8 @@ void MSM::make_rho() void MSM::direct(int n) { + //fprintf(screen,"Direct contribution on level %i\n\n",n); + double ***qgridn = qgrid[n]; double ***egridn = egrid[n]; double ***v0gridn = v0grid[n]; @@ -1774,6 +1768,8 @@ void MSM::direct(int n) void MSM::direct_peratom(int n) { + //fprintf(screen,"Direct contribution on level %i\n\n",n); + double ***qgridn = qgrid[n]; double ***v0gridn = v0grid[n]; double ***v1gridn = v1grid[n]; @@ -1896,6 +1892,8 @@ void MSM::direct_peratom(int n) void MSM::direct_top(int n) { + //fprintf(screen,"Direct contribution on level %i\n\n",n); + double ***qgridn = qgrid[n]; double ***egridn = egrid[n]; double ***v0gridn = v0grid[n]; @@ -2259,6 +2257,8 @@ void MSM::direct_peratom_top(int n) void MSM::restriction(int n) { + //fprintf(screen,"Restricting from level %i to %i\n\n",n,n+1); + const int p = order-1; double ***qgrid1 = qgrid[n]; @@ -2281,7 +2281,8 @@ void MSM::restriction(int n) // zero out charge on coarser grid - memset(&(qgrid2[nzlo_out[n+1]][nylo_out[n+1]][nxlo_out[n+1]]),0,ngrid[n+1]*sizeof(double)); + memset(&(qgrid2[nzlo_out[n+1]][nylo_out[n+1]][nxlo_out[n+1]]),0, + ngrid[n+1]*sizeof(double)); for (kp = nzlo_in[n+1]; kp <= nzhi_in[n+1]; kp++) for (jp = nylo_in[n+1]; jp <= nyhi_in[n+1]; jp++) @@ -2330,6 +2331,8 @@ void MSM::restriction(int n) void MSM::prolongation(int n) { + //fprintf(screen,"Prolongating from level %i to %i\n\n",n+1,n); + const int p = order-1; double ***egrid1 = egrid[n]; @@ -2718,6 +2721,8 @@ void MSM::unpack_reverse(int flag, double 
*buf, int nlist, int *list) void MSM::fieldforce() { + //fprintf(screen,"MSM interpolation\n\n"); + double ***egridn = egrid[0]; int i,l,m,n,nx,ny,nz,mx,my,mz; From 8d5a9ad4af7bf708a45ab20900e2373384ed2fa0 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 22 Jun 2020 19:28:24 -0400 Subject: [PATCH 6/6] implement alternate version of MSM leak fix --- src/KSPACE/msm.cpp | 152 ++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index 70a36984d7..e4e5846c9e 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -28,6 +28,8 @@ #include "domain.h" #include "memory.h" #include "error.h" +#include "utils.h" +#include "fmt/format.h" #include "math_const.h" @@ -63,9 +65,6 @@ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), factors[0] = 2; MPI_Comm_rank(world,&me); - procneigh_levels = NULL; - world_levels = NULL; - active_flag = NULL; phi1d = dphi1d = NULL; @@ -81,15 +80,7 @@ MSM::MSM(LAMMPS *lmp) : KSpace(lmp), v0_direct_top = v1_direct_top = v2_direct_top = NULL; v3_direct_top = v4_direct_top = v5_direct_top = NULL; - cg_all = cg_peratom_all = NULL; - cg = cg_peratom = NULL; - ngrid = NULL; - cg = NULL; - cg_peratom = NULL; - procneigh_levels = NULL; - world_levels = NULL; - active_flag = NULL; alpha = betax = betay = betaz = NULL; nx_msm = ny_msm = nz_msm = NULL; @@ -150,10 +141,7 @@ MSM::~MSM() void MSM::init() { - if (me == 0) { - if (screen) fprintf(screen,"MSM initialization ...\n"); - if (logfile) fprintf(logfile,"MSM initialization ...\n"); - } + if (me == 0) utils::logmesg(lmp,"MSM initialization ...\n"); // error check @@ -169,13 +157,8 @@ void MSM::init() if ((slabflag == 1) && (me == 0)) error->warning(FLERR,"Slab correction not needed for MSM"); - if (order < 4 || order > 10) { - char str[128]; - sprintf(str,"MSM order must be 4, 6, 8, or 10"); - error->all(FLERR,str); - } - - if (order%2 != 0) error->all(FLERR,"MSM order must be 4, 6, 8, or 10"); + if ((order < 4) || (order > 10) || (order%2 != 0)) + error->all(FLERR,"MSM order must be 4, 6, 8, or 10"); if (sizeof(FFT_SCALAR) != 8) error->all(FLERR,"Cannot (yet) use single precision with MSM " @@ -221,33 +204,14 @@ void MSM::init() MPI_Allreduce(&ngrid[0],&ngrid_max,1,MPI_INT,MPI_MAX,world); if (me == 0) { - if (screen) { - fprintf(screen," 3d grid size/proc = %d\n", - ngrid_max); - fprintf(screen," estimated absolute RMS force accuracy = %g\n", - estimated_error); - fprintf(screen," estimated relative force accuracy = %g\n", - estimated_error/two_charge_force); - } - if (logfile) { - fprintf(logfile," 3d grid size/proc = %d\n", - ngrid_max); - fprintf(logfile," estimated absolute RMS force accuracy = %g\n", - estimated_error); - fprintf(logfile," estimated relative force accuracy = %g\n", - estimated_error/two_charge_force); - } - } - - if (me == 0) { - if (screen) { - fprintf(screen," grid = %d %d %d\n",nx_msm[0],ny_msm[0],nz_msm[0]); - fprintf(screen," order = %d\n",order); - } - if (logfile) { - fprintf(logfile," grid = %d %d %d\n",nx_msm[0],ny_msm[0],nz_msm[0]); - fprintf(logfile," order = %d\n",order); - } + std::string mesg = fmt::format(" 3d grid size/proc = {}\n", ngrid_max); + mesg += fmt::format(" estimated absolute RMS force accuracy = {}\n", + estimated_error); + mesg += fmt::format(" estimated relative force accuracy = {}\n", + estimated_error/two_charge_force); + mesg += fmt::format(" grid = {} {} {}\n",nx_msm[0],ny_msm[0],nz_msm[0]); + mesg += fmt::format(" order = {}\n",order); + utils::logmesg(lmp,mesg); } } @@ -699,6 
+663,7 @@ void MSM::deallocate() memory->destroy2d_offset(dphi1d,-order_allocated); if (cg_all) delete cg_all; + cg_all = nullptr; for (int n=0; ndestroy(procneigh_levels); delete [] world_levels; @@ -919,6 +890,50 @@ void MSM::deallocate_levels() delete [] v3grid; delete [] v4grid; delete [] v5grid; + + world_levels = nullptr; + active_flag = nullptr; + cg = nullptr; + cg_peratom = nullptr; + + alpha = nullptr; + betax = nullptr; + betay = nullptr; + betaz = nullptr; + + nx_msm = nullptr; + ny_msm = nullptr; + nz_msm = nullptr; + + nxlo_in = nullptr; + nylo_in = nullptr; + nzlo_in = nullptr; + + nxhi_in = nullptr; + nyhi_in = nullptr; + nzhi_in = nullptr; + + nxlo_out = nullptr; + nylo_out = nullptr; + nzlo_out = nullptr; + + nxhi_out = nullptr; + nyhi_out = nullptr; + nzhi_out = nullptr; + + delxinv = nullptr; + delyinv = nullptr; + delzinv = nullptr; + + qgrid = nullptr; + egrid = nullptr; + + v0grid = nullptr; + v1grid = nullptr; + v2grid = nullptr; + v3grid = nullptr; + v4grid = nullptr; + v5grid = nullptr; } /* ---------------------------------------------------------------------- @@ -1053,9 +1068,9 @@ void MSM::set_grid_global() double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); *p_cutoff = cutoff; - char str[128]; - sprintf(str,"Adjusting Coulombic cutoff for MSM, new cutoff = %g",cutoff); - if (me == 0) error->warning(FLERR,str); + if (me == 0) + error->warning(FLERR,fmt::format("Adjusting Coulombic cutoff for " + "MSM, new cutoff = {}", cutoff)); } if (triclinic == 0) { @@ -1318,8 +1333,8 @@ void MSM::set_grid_local() void MSM::set_proc_grid(int n) { - for (int i=0; i<3; i++) - myloc[i] = comm->myloc[i]; + for (int i=0; i<3; i++) + myloc[i] = comm->myloc[i]; // size of inner MSM grid owned by this proc @@ -1362,6 +1377,7 @@ void MSM::set_proc_grid(int n) // define a new MPI communicator for this grid level that only includes active procs + if(world_levels[n] != MPI_COMM_NULL) MPI_Comm_free(&world_levels[n]); MPI_Comm_split(world,color,me,&world_levels[n]); if (!active_flag[n]) return; @@ -1479,16 +1495,13 @@ void MSM::particle_map() void MSM::make_rho() { - //fprintf(screen,"MSM aninterpolation\n\n"); - int i,l,m,n,nx,ny,nz,mx,my,mz; double dx,dy,dz,x0,y0,z0; // clear 3d density array - double ***qgridn = qgrid[0]; - - memset(&(qgridn[nzlo_out[0]][nylo_out[0]][nxlo_out[0]]),0,ngrid[0]*sizeof(double)); + double ***qgrid0 = qgrid[0]; + memset(&(qgrid0[nzlo_out[0]][nylo_out[0]][nxlo_out[0]]),0,ngrid[0]*sizeof(double)); // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -1519,7 +1532,7 @@ void MSM::make_rho() x0 = y0*phi1d[1][m]; for (l = nlower; l <= nupper; l++) { mx = l+nx; - qgridn[mz][my][mx] += x0*phi1d[0][l]; + qgrid0[mz][my][mx] += x0*phi1d[0][l]; } } } @@ -1534,8 +1547,6 @@ void MSM::make_rho() void MSM::direct(int n) { - //fprintf(screen,"Direct contribution on level %i\n\n",n); - double ***qgridn = qgrid[n]; double ***egridn = egrid[n]; double ***v0gridn = v0grid[n]; @@ -1768,8 +1779,6 @@ void MSM::direct(int n) void MSM::direct_peratom(int n) { - //fprintf(screen,"Direct contribution on level %i\n\n",n); - double ***qgridn = qgrid[n]; double ***v0gridn = v0grid[n]; double ***v1gridn = v1grid[n]; @@ -1892,8 +1901,6 @@ void MSM::direct_peratom(int n) void MSM::direct_top(int n) { - //fprintf(screen,"Direct contribution on level %i\n\n",n); - double ***qgridn = qgrid[n]; double ***egridn = egrid[n]; double ***v0gridn = v0grid[n]; @@ -2257,8 +2264,6 @@ void 
MSM::direct_peratom_top(int n) void MSM::restriction(int n) { - //fprintf(screen,"Restricting from level %i to %i\n\n",n,n+1); - const int p = order-1; double ***qgrid1 = qgrid[n]; @@ -2281,8 +2286,7 @@ void MSM::restriction(int n) // zero out charge on coarser grid - memset(&(qgrid2[nzlo_out[n+1]][nylo_out[n+1]][nxlo_out[n+1]]),0, - ngrid[n+1]*sizeof(double)); + memset(&(qgrid2[nzlo_out[n+1]][nylo_out[n+1]][nxlo_out[n+1]]),0,ngrid[n+1]*sizeof(double)); for (kp = nzlo_in[n+1]; kp <= nzhi_in[n+1]; kp++) for (jp = nylo_in[n+1]; jp <= nyhi_in[n+1]; jp++) @@ -2331,8 +2335,6 @@ void MSM::restriction(int n) void MSM::prolongation(int n) { - //fprintf(screen,"Prolongating from level %i to %i\n\n",n+1,n); - const int p = order-1; double ***egrid1 = egrid[n]; @@ -2721,8 +2723,6 @@ void MSM::unpack_reverse(int flag, double *buf, int nlist, int *list) void MSM::fieldforce() { - //fprintf(screen,"MSM interpolation\n\n"); - double ***egridn = egrid[0]; int i,l,m,n,nx,ny,nz,mx,my,mz;
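Patch 6/6 reinstates the reverted leak fix in a more localized form: freed pointers are reset to nullptr in deallocate()/deallocate_levels(), and per-level communicators are released only when they are not MPI_COMM_NULL, so repeated setup/teardown neither leaks nor double-frees. A minimal standalone sketch of that idiom follows; LevelData and its members are illustrative stand-ins, not the actual MSM class members.

#include <mpi.h>

// Sketch of the "free, then reset" teardown idiom used in patch 6/6.
struct LevelData {
  int levels = 0;
  MPI_Comm *world_levels = nullptr;

  void deallocate_levels() {
    if (world_levels) {
      for (int n = 0; n < levels; n++)
        if (world_levels[n] != MPI_COMM_NULL)   // release only live communicators
          MPI_Comm_free(&world_levels[n]);
      delete[] world_levels;
    }
    world_levels = nullptr;   // mark as freed so a second call is a no-op
  }
};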