diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index d6b028640d..a4e15fe5ce 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -30,19 +30,31 @@ PairStyle(snap/kk/host,PairSNAPKokkos) namespace LAMMPS_NS { +// Routines for both the CPU and GPU backend template struct TagPairSNAPComputeForce{}; - -struct TagPairSNAPBeta{}; struct TagPairSNAPComputeNeigh{}; + +// GPU backend only struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUi{}; -struct TagPairSNAPComputeUiCPU{}; +struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; +struct TagPairSNAPBeta{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPZeroYi{}; +struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS struct TagPairSNAPComputeYi{}; +struct TagPairSNAPTransformYi{}; // re-order ylist from AoSoA to AoS struct TagPairSNAPComputeFusedDeidrj{}; + +// CPU backend only +struct TagPairSNAPPreUiCPU{}; +struct TagPairSNAPComputeUiCPU{}; +struct TagPairSNAPComputeZiCPU{}; +struct TagPairSNAPBetaCPU{}; +struct TagPairSNAPComputeBiCPU{}; +struct TagPairSNAPZeroYiCPU{}; +struct TagPairSNAPComputeYiCPU{}; struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDeidrjCPU{}; @@ -81,6 +93,10 @@ public: KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPBetaCPU,const int& ii) const; + + // GPU backend only KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -88,32 +104,54 @@ public: void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi,const int& ii) const; + void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPBeta, const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi,const int& ii) const; + void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const; + // CPU backend only + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeZiCPU,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiCPU,const int& ii) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPBeta,const int& ii) const; - template KOKKOS_INLINE_FUNCTION void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, @@ -175,6 +213,7 @@ inline double dist2(double* x,double* y); Kokkos::View d_map; // mapping from atom types to elements Kokkos::View d_ninside; // ninside for all atoms in list Kokkos::View d_beta; // betas for all atoms in list + Kokkos::View d_beta_pack; // betas for all atoms in list, GPU Kokkos::View d_bispectrum; // bispectrum components for all atoms in list typedef Kokkos::DualView tdual_fparams; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 5dbf8e3a5a..6bedf5554e 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -194,6 +194,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) if (beta_max < inum) { beta_max = inum; d_beta = Kokkos::View("PairSNAPKokkos:beta",ncoeff,inum); + if (!host_flag) + d_beta_pack = Kokkos::View("PairSNAPKokkos:beta_pack",32,ncoeff,(inum+32-1)/32); d_ninside = Kokkos::View("PairSNAPKokkos:ninside",inum); } @@ -222,32 +224,93 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); } - //PreUi + if (host_flag) { - int vector_length = vector_length_default; - int team_size = team_size_default; - if (!host_flag) - vector_length = 32; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length); - Kokkos::parallel_for("PreUi",policy_preui,*this); - } + // Host codepath - // ComputeUI - { - int vector_length = vector_length_default; - int team_size = team_size_default; - if (host_flag) { // CPU - // Run a fused calculation of ulist and accumulation into ulisttot using atomics + //PreUi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); + Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); + } + // ComputeUi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + // Fused calculation of ulist and accumulation into ulisttot using atomics typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); + } - } else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics + //Compute bispectrum + if (quadraticflag || eflag) { + //ComputeZi + int idxz_max = snaKK.idxz_max; + typename Kokkos::RangePolicy policy_zi_cpu(0,chunk_size*idxz_max); + Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); - vector_length = 32; - team_size = 4; // need to cap b/c of shared memory reqs + //ComputeBi + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_bi_cpu(chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); + } + + //ZeroYi,ComputeYi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this); + + //ZeroYi + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); + Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this); + + //ComputeYi + int idxz_max = snaKK.idxz_max; + typename Kokkos::RangePolicy policy_yi_cpu(0,chunk_size*idxz_max); + Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this); + } // host flag + + //ComputeDuidrj and Deidrj + { + int team_size = team_size_default; + int vector_length = vector_length_default; + + typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + snaKK.set_dir(-1); // technically doesn't do anything + Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); + + typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); + } + } else { // GPU + +#ifdef KOKKOS_ENABLE_CUDA + //PreUi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + typename Kokkos::TeamPolicy policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length); + Kokkos::parallel_for("PreUi",policy_preui,*this); + } + + // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot + { + + int vector_length = 32; + int team_size = 4; // need to cap b/c of shared memory reqs check_team_size_for(chunk_size,team_size,vector_length); // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values @@ -265,62 +328,54 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeUi",policy_ui,*this); + //Transform data layout of ulisttot to AoSoA, zero ylist + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + } - } + //Compute bispectrum in AoSoA data layout, transform Bi + if (quadraticflag || eflag) { + //ComputeZi + int idxz_max = snaKK.idxz_max; + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); - //Compute bispectrum - if (quadraticflag || eflag) { - //ComputeZi - int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_zi(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeZi",policy_zi,*this); + //ComputeBi + int idxb_max = snaKK.idxb_max; + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); - //ComputeBi - int vector_length = vector_length_default; - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_bi(chunk_size,team_size,vector_length); - Kokkos::parallel_for("ComputeBi",policy_bi,*this); - } + //Transform data layout of blist out of AoSoA + //We need this b/c `blist` gets used in ComputeForce which doesn't + //take advantage of AoSoA (which at best would only be beneficial + //on the margins) + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + } - //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); - Kokkos::parallel_for("ComputeBeta",policy_beta,*this); + //ComputeYi in AoSoA data layout, transform to AoS for ComputeFusedDeidrj + //Note zeroing `ylist` is fused into `TransformUi`. + { + //Compute beta = dE_i/dB_i for all i in list + typename Kokkos::RangePolicy policy_beta(0,chunk_size); + Kokkos::parallel_for("ComputeBeta",policy_beta,*this); - //ZeroYi - { - int vector_length = vector_length_default; - int team_size = team_size_default; - if (!host_flag) - team_size = 128; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); - Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this); - } + //ComputeYi + const int idxz_max = snaKK.idxz_max; + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); - //ComputeYi - int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_yi(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeYi",policy_yi,*this); + //Transform data layout of ylist out of AoSoA + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + Kokkos::parallel_for("TransformYi",policy_transform_yi,*this); - //ComputeDuidrj and Deidrj - { - int team_size = team_size_default; - int vector_length = vector_length_default; - if (host_flag) { // CPU + } - typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - snaKK.set_dir(-1); // technically doesn't do anything - Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - - typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - - Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); - } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei - - vector_length = 32; - team_size = 2; // need to cap b/c of shared memory reqs + // Fused ComputeDuidrj, ComputeDeidrj + { + int vector_length = 32; + int team_size = 2; // need to cap b/c of shared memory reqs check_team_size_for(chunk_size,team_size,vector_length); // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry @@ -341,6 +396,9 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this); } } + +#endif // KOKKOS_ENABLE_CUDA + } //ComputeForce @@ -416,38 +474,6 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) } } -/* ---------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { - - const int i = d_ilist[ii + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - SNAKokkos my_sna = snaKK; - - Kokkos::View> - d_coeffi(d_coeffelem,ielem,Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) - d_beta(icoeff,ii) = d_coeffi[icoeff+1]; - - if (quadraticflag) { - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - double bveci = my_sna.blist(icoeff,ii); - d_beta(icoeff,ii) += d_coeffi[k]*bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - double bvecj = my_sna.blist(jcoeff,ii); - d_beta(icoeff,ii) += d_coeffi[k]*bvecj; - d_beta(jcoeff,ii) += d_coeffi[k]*bveci; - k++; - } - } - } -} /* ---------------------------------------------------------------------- allocate all arrays @@ -520,6 +546,10 @@ void PairSNAPKokkos::coeff(int narg, char **arg) snaKK.init(); } +/* ---------------------------------------------------------------------- + Begin routines that are called on both CPU and GPU codepaths +------------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- */ template @@ -594,6 +624,53 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typen }); } +/* ---------------------------------------------------------------------- + Begin routines that are unique to the GPU codepath. These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { + + if (ii >= chunk_size) return; + + const int iatom_mod = ii % 32; + const int iatom_div = ii / 32; + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + SNAKokkos my_sna = snaKK; + + Kokkos::View> + d_coeffi(d_coeffelem,ielem,Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1]; + } + + if (quadraticflag) { + const auto idxb_max = my_sna.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + double bveci = my_sna.blist(idxb, idx_chem, ii); + d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + double bvecj = my_sna.blist(jdxb, jdx_chem, ii); + d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj; + d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci; + k++; + } + } + } +} + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -627,61 +704,107 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { SNAKokkos my_sna = snaKK; - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; + if (idxu >= my_sna.idxu_max) return; + + int elem_count = chemflag ? nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++) { + + const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom); + const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom); + + my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + + my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.; + my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.; + } - my_sna.compute_ui_cpu(team,ii,jj); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { SNAKokkos my_sna = snaKK; - // Extract the quantum number - const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); - if (idx >= my_sna.idxu_max) return; + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; - // Extract the atomic index - const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); - if (ii >= chunk_size) return; + if (jjz >= my_sna.idxz_max) return; - if (chemflag) - for(int ielem = 0; ielem < nelements; ielem++) - my_sna.zero_yi(idx,ii,ielem); - else - my_sna.zero_yi(idx,ii,0); + my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const { SNAKokkos my_sna = snaKK; - my_sna.compute_yi(ii,d_beta); + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (idxu >= my_sna.idxu_max) return; + + int elem_count = chemflag ? nelements : 1; + for (int ielem = 0; ielem < elem_count; ielem++) { + const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div); + const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div); + + my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im }; + } + } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { SNAKokkos my_sna = snaKK; - my_sna.compute_zi(ii); + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (jjz >= my_sna.idxz_max) return; + + my_sna.compute_zi(iatom_mod,jjz,iatom_div); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { SNAKokkos my_sna = snaKK; - my_sna.compute_bi(team,ii); + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (jjb >= my_sna.idxb_max) return; + + my_sna.compute_bi(iatom_mod,jjb,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * 32; + if (iatom >= chunk_size) return; + + if (idxb >= my_sna.idxb_max) return; + + const int ntriples = my_sna.ntriples; + + for (int itriple = 0; itriple < ntriples; itriple++) { + + const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); + + my_sna.blist(idxb, itriple, iatom) = blocal; + } + } template @@ -701,6 +824,126 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrj,const my_sna.compute_fused_deidrj(team,ii,jj); } +/* ---------------------------------------------------------------------- + Begin routines that are unique to the CPU codepath. These do not take + advantage of AoSoA data layouts, but that could be a good point of + future optimization and unification with the above kernels. It's unlikely + that scratch memory optimizations will ever be useful for the CPU due to + different arithmetic intensity requirements for the CPU vs GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPBetaCPU,const int& ii) const { + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + SNAKokkos my_sna = snaKK; + + Kokkos::View> + d_coeffi(d_coeffelem,ielem,Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + d_beta(icoeff,ii) = d_coeffi[icoeff+1]; + + if (quadraticflag) { + const auto idxb_max = my_sna.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + double bveci = my_sna.blist(idxb,idx_chem,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + double bvecj = my_sna.blist(jdxb,jdx_chem,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bvecj; + d_beta(jcoeff,ii) += d_coeffi[k]*bveci; + k++; + } + } + } +} + + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the atom number + const int ii = team.team_rank() + team.team_size() * team.league_rank(); + if (ii >= chunk_size) return; + int itype = type(ii); + int ielem = d_map[itype]; + + my_sna.pre_ui_cpu(team,ii,ielem); +} + + + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_cpu(team,ii,jj); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the quantum number + const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); + if (idx >= my_sna.idxu_max) return; + + // Extract the atomic index + const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); + if (ii >= chunk_size) return; + + if (chemflag) + for(int ielem = 0; ielem < nelements; ielem++) + my_sna.zero_yi_cpu(idx,ii,ielem); + else + my_sna.zero_yi_cpu(idx,ii,0); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { + SNAKokkos my_sna = snaKK; + my_sna.compute_yi_cpu(ii,d_beta); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU,const int& ii) const { + SNAKokkos my_sna = snaKK; + my_sna.compute_zi_cpu(ii); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + int ii = team.league_rank(); + SNAKokkos my_sna = snaKK; + my_sna.compute_bi_cpu(team,ii); +} + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -735,6 +978,12 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const t my_sna.compute_deidrj_cpu(team,ii,jj); } +/* ---------------------------------------------------------------------- + Also used for both CPU and GPU codepaths. Could maybe benefit from a + separate GPU/CPU codepath, but this kernel takes so little time it's + likely not worth it. +------------------------------------------------------------------------- */ + template template KOKKOS_INLINE_FUNCTION @@ -799,20 +1048,31 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeForce t_sna_2d; typedef Kokkos::View t_sna_2d_ll; typedef Kokkos::View t_sna_3d; + typedef Kokkos::View t_sna_3d_ll; typedef Kokkos::View t_sna_4d; + typedef Kokkos::View t_sna_4d_ll; typedef Kokkos::View t_sna_3d3; typedef Kokkos::View t_sna_5d; @@ -48,7 +50,8 @@ public: typedef Kokkos::View t_sna_3c; typedef Kokkos::View t_sna_3c_ll; typedef Kokkos::View t_sna_4c; - typedef Kokkos::View t_sna_4c_ll; + typedef Kokkos::View t_sna_4c3_ll; + typedef Kokkos::View t_sna_4c_ll; typedef Kokkos::View t_sna_3c3; typedef Kokkos::View t_sna_5c; @@ -73,27 +76,39 @@ inline int ncoeff; - // functions for bispectrum coefficients + // functions for bispectrum coefficients, GPU only KOKKOS_INLINE_FUNCTION - void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&, int); // ForceSNAP + void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); // ForceSNAP KOKKOS_INLINE_FUNCTION + void compute_zi(const int&, const int&, const int&); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_yi(int,int,int, + const Kokkos::View &beta_pack); // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_bi(const int&, const int&, const int&); // ForceSNAP + + // functions for bispectrum coefficients, CPU only + KOKKOS_INLINE_FUNCTION + void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP + KOKKOS_INLINE_FUNCTION void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_zi(const int&); // ForceSNAP + void compute_zi_cpu(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void zero_yi(const int&, const int&, int); // ForceSNAP + void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_yi(int, + void compute_yi_cpu(int, const Kokkos::View &beta); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_bi(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP - - // functions for derivatives + KOKKOS_INLINE_FUNCTION + void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + // functions for derivatives, GPU only KOKKOS_INLINE_FUNCTION void compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); //ForceSNAP + + // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP KOKKOS_INLINE_FUNCTION @@ -139,18 +154,32 @@ inline int twojmax, diagonalstyle; - t_sna_2d_ll blist; - t_sna_2c_ll ulisttot; - t_sna_2c_ll zlist; + t_sna_3d_ll blist; + t_sna_3c_ll ulisttot; + t_sna_3c_ll zlist; t_sna_3c_ll ulist; - t_sna_2c_ll ylist; - + t_sna_3c_ll ylist; + // derivatives of data - t_sna_4c_ll dulist; + t_sna_4c3_ll dulist; + + // Modified structures for GPU backend + t_sna_3d_ll ulisttot_re; // split real, + t_sna_3d_ll ulisttot_im; // imag + t_sna_4c_ll ulisttot_pack; // AoSoA layout + t_sna_4c_ll zlist_pack; // AoSoA layout + t_sna_4d_ll blist_pack; + t_sna_4d_ll ylist_pack_re; // split real, + t_sna_4d_ll ylist_pack_im; // imag AoSoA layout int idxcg_max, idxu_max, idxz_max, idxb_max; + // Chem snap counts + int nelements; + int ndoubles; + int ntriples; + private: double rmin0, rfac0; @@ -212,9 +241,6 @@ inline // Chem snap flags int chem_flag; int bnorm_flag; - int nelements; - int ndoubles; - int ntriples; // Self-weight double wself; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index dae4a0c2a1..866b4c516f 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -237,56 +237,71 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) element = t_sna_2i("sna:rcutij",natom,nmax); dedr = t_sna_3d("sna:dedr",natom,nmax,3); - blist = t_sna_2d_ll("sna:blist",idxb_max*ntriples,natom); - //ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max); - ulisttot = t_sna_2c_ll("sna:ulisttot",idxu_max*nelements,natom); - - zlist = t_sna_2c_ll("sna:zlist",idxz_max*ndoubles,natom); - - //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); #ifdef KOKKOS_ENABLE_CUDA if (std::is_same::value) { // dummy allocation + ulisttot = t_sna_3c_ll("sna:ulisttot",1,1,1); + ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",idxu_max,nelements,natom); + ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",idxu_max,nelements,natom); + ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",32,idxu_max,nelements,(natom+32-1)/32); ulist = t_sna_3c_ll("sna:ulist",1,1,1); - dulist = t_sna_4c_ll("sna:dulist",1,1,1); + zlist = t_sna_3c_ll("sna:zlist",1,1,1); + zlist_pack = t_sna_4c_ll("sna:zlist_pack",32,idxz_max,ndoubles,(natom+32-1)/32); + blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom); + blist_pack = t_sna_4d_ll("sna:blist_pack",32,idxb_max,ntriples,(natom+32-1)/32); + ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom); + ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",32,idxu_max,nelements,(natom+32-1)/32); + ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",32,idxu_max,nelements,(natom+32-1)/32); + dulist = t_sna_4c3_ll("sna:dulist",1,1,1); } else { #endif + ulisttot = t_sna_3c_ll("sna:ulisttot",idxu_max,nelements,natom); + ulisttot_re = t_sna_3d_ll("sna:ulisttot_re",1,1,1); + ulisttot_im = t_sna_3d_ll("sna:ulisttot_im",1,1,1); + ulisttot_pack = t_sna_4c_ll("sna:ulisttot_pack",1,1,1,1); ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); - dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); + zlist = t_sna_3c_ll("sna:zlist",idxz_max,ndoubles,natom); + zlist_pack = t_sna_4c_ll("sna:zlist_pack",1,1,1,1); + blist = t_sna_3d_ll("sna:blist",idxb_max,ntriples,natom); + blist_pack = t_sna_4d_ll("sna:blist_pack",1,1,1,1); + ylist = t_sna_3c_ll("sna:ylist",idxu_max,nelements,natom); + ylist_pack_re = t_sna_4d_ll("sna:ylist_pack_re",1,1,1,1); + ylist_pack_im = t_sna_4d_ll("sna:ylist_pack_im",1,1,1,1); + dulist = t_sna_4c3_ll("sna:dulist",idxu_max,natom,nmax); + #ifdef KOKKOS_ENABLE_CUDA } #endif - - //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); - ylist = t_sna_2c_ll("sna:ylist",idxu_max*nelements,natom); } /* ---------------------------------------------------------------------- - * compute Ui by summing over neighbors j - * ------------------------------------------------------------------------- */ + * GPU routines + * ----------------------------------------------------------------------*/ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, int ielem) +void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) { - for(int jelem = 0; jelem < nelements; jelem++) - for (int j = 0; j <= twojmax; j++) { - const int jju = idxu_block(j); + for (int jelem = 0; jelem < nelements; jelem++) { + for (int j = 0; j <= twojmax; j++) { + const int jju = idxu_block(j); - // Only diagonal elements get initialized - // for (int m = 0; m < (j+1)*(j+1); m++) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), - [&] (const int m) { + // Only diagonal elements get initialized + // for (int m = 0; m < (j+1)*(j+1); m++) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), + [&] (const int m) { - const int jjup = jju + m; + const int jjup = jju + m; - // if m is on the "diagonal", initialize it with the self energy. - // Otherwise zero it out - SNAcomplex init = {0., 0.}; - if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = {wself, 0.0}; } //need to map iatom to element + // if m is on the "diagonal", initialize it with the self energy. + // Otherwise zero it out + double re_part = 0.; + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; } - ulisttot(jelem*idxu_max+jjup, iatom) = init; - }); + ulisttot_re(jjup, jelem, iatom) = re_part; + ulisttot_im(jjup, jelem, iatom) = 0.; + }); + } } } @@ -322,7 +337,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); - //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.); + const SNAcomplex u_up2 = rootpq2*((ma > 0)?buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.)); + + // u_accum += conj(b) * u_up2 caconjxpy(b, u_up2, u_accum); // VMK recursion relation: grab contribution which is multiplied by a* const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); - //const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.); + const SNAcomplex u_up1 = rootpq1*((ma < j)?buf1[jjup_shared_idx]:SNAcomplex(0.,0.)); + + // u_accum += conj(a) * u_up1 caconjxpy(a, u_up1, u_accum); - //ulist(jju_index,iatom,jnbor) = u_accum; // back up into shared memory for next iter buf2[jju_shared_idx] = u_accum; - Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re); - Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im); + Kokkos::atomic_add(&(ulisttot_re(jju_index,jelem,iatom)), sfac * u_accum.re); + Kokkos::atomic_add(&(ulisttot_im(jju_index,jelem,iatom)), sfac * u_accum.im); // copy left side to right side with inversion symmetry VMK 4.4(2) // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) @@ -415,7 +428,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy::compute_ui(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div) { - double rsq, r, x, y, z, z0, theta0; - - // utot(j,ma,mb) = 0 for all j,ma,ma - // utot(j,ma,ma) = 1 for all j,ma - // for j in neighbors of i: - // compute r0 = (x,y,z,z0) - // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb - - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - - theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); - // theta0 = (r - rmin0) * rscale0; - z0 = r / tan(theta0); - - compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); - add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), element(iatom, jnbor)); - -} - -/* ---------------------------------------------------------------------- - compute Zi by summing over products of Ui -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi(const int& iter) -{ - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; const int j1 = idxz(jjz, 0); const int j2 = idxz(jjz, 1); @@ -493,77 +474,161 @@ void SNAKokkos::compute_zi(const int& iter) const int na = idxz(jjz, 7); const int nb = idxz(jjz, 8); - const double *cgblock = cglist.data() + idxcg_block(j1, j2, j); + const double* cgblock = cglist.data() + idxcg_block(j1, j2, j); int idouble = 0; - for(int elem1 = 0; elem1 < nelements; elem1++) - for(int elem2 = 0; elem2 < nelements; elem2++) { - const int jjza = idouble*idxz_max + jjz; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + double ztmp_r = 0.; + double ztmp_i = 0.; - zlist(jjza, iatom).re = 0.0; - zlist(jjza, iatom).im = 0.0; + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; - int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min; - int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 + 1) + mb2max; - for (int ib = 0; ib < nb; ib++) { + #pragma unroll + for(int ib = 0; ib < nb; ib++) { - double suma1_r = 0.0; - double suma1_i = 0.0; + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min * (j2 + 1) + ma2max; - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).re - - ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * ulisttot(jju2 + ma2, iatom).im + - ulisttot(jju1 + ma1, iatom).im * ulisttot(jju2 + ma2, iatom).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia + #pragma unroll + for(int ia = 0; ia < na; ia++) { + const SNAcomplex utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div); + const SNAcomplex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div); + const auto cgcoeff_a = cgblock[icga]; + const auto cgcoeff_b = cgblock[icgb]; + ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); + ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - zlist(jjza, iatom).re += cgblock[icgb] * suma1_r; - zlist(jjza, iatom).im += cgblock[icgb] * suma1_i; + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; + if (bnorm_flag) { + ztmp_r /= (j + 1); + ztmp_i /= (j + 1); + } - } // end loop over ib - if (bnorm_flag) { - zlist(jjza, iatom).re /= (j+1); - zlist(jjza, iatom).im /= (j+1); - } - idouble++; + zlist_pack(iatom_mod,jjz,idouble,iatom_div) = { ztmp_r, ztmp_i }; + + idouble++; + } } } /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices + compute Bi by summing conj(Ui)*Zi ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::zero_yi(const int& idx, const int& iatom, int ielem) +void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) { - ylist(ielem*idxu_max+idx,iatom) = {0.0, 0.0}; + // for j1 = 0,...,twojmax + // for j2 = 0,twojmax + // for j = |j1-j2|,Min(twojmax,j1+j2),2 + // b(j1,j2,j) = 0 + // for mb = 0,...,jmid + // for ma = 0,...,j + // b(j1,j2,j) += + // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) + + const int j1 = idxb(jjb,0); + const int j2 = idxb(jjb,1); + const int j = idxb(jjb,2); + + const int jjz = idxz_block(j1,j2,j); + const int jju = idxu_block[j]; + + int itriple = 0; + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + const int jalloy = idouble; + + for (int elem3 = 0; elem3 < nelements; elem3++) { + const int jjballoy = itriple; + + double sumzu = 0.0; + double sumzu_temp = 0.0; + + for(int mb = 0; 2*mb < j; mb++) { + for(int ma = 0; ma <= j; ma++) { + const int jju_index = jju+mb*(j+1)+ma; + const int jjz_index = jjz+mb*(j+1)+ma; + if (2*mb == j) return; // I think we can remove this? + const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + } + } + sumzu += sumzu_temp; + + // For j even, special treatment for middle column + if (j%2 == 0) { + sumzu_temp = 0.; + + const int mb = j/2; + for(int ma = 0; ma < mb; ma++) { + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + + } + sumzu += sumzu_temp; + + const int ma = mb; + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); + const auto zloc = zlist_pack(iatom_mod, jjz_index, jalloy, iatom_div); + sumzu += 0.5 * (utot.re * zloc.re + utot.im * zloc.im); + } // end if jeven + + sumzu *= 2.0; + if (bzero_flag) { + if (!wselfall_flag) { + if (elem1 == elem2 && elem1 == elem3) { + sumzu -= bzero[j]; + } + } else { + sumzu -= bzero[j]; + } + } + blist_pack(iatom_mod, jjb, jjballoy, iatom_div) = sumzu; + //} // end loop over j + //} // end loop over j1, j2 + itriple++; + } // end loop over elem3 + idouble++; + } // end loop over elem2 + } // end loop over elem1 } + /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices + compute Yi from Ui without storing Zi, looping over zlist indices, + GPU version ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi(int iter, - const Kokkos::View &beta) +void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, + const Kokkos::View &beta_pack) { double betaj; - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; const int j1 = idxz(jjz, 0); const int j2 = idxz(jjz, 1); @@ -576,88 +641,84 @@ void SNAKokkos::compute_yi(int iter, const int nb = idxz(jjz, 8); const int jju = idxz(jjz, 9); - const double *cgblock = cglist.data() + idxcg_block(j1, j2, j); + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; - int itriple; - for(int elem1 = 0; elem1 < nelements; elem1++) + for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - double ztmp_r = 0.0; - double ztmp_i = 0.0; + double ztmp_r = 0.0; + double ztmp_i = 0.0; - int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min; - int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 + 1) + mb2max; - for (int ib = 0; ib < nb; ib++) { + int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; + int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; + int icgb = mb1min * (j2 + 1) + mb2max; - double suma1_r = 0.0; - double suma1_i = 0.0; + #pragma unroll + for (int ib = 0; ib < nb; ib++) { - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min * (j2 + 1) + ma2max; + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * - ulisttot(jju2 + ma2, iatom).re - - ulisttot(jju1 + ma1, iatom).im * - ulisttot(jju2 + ma2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot(jju1 + ma1, iatom).re * - ulisttot(jju2 + ma2, iatom).im + - ulisttot(jju1 + ma1, iatom).im * - ulisttot(jju2 + ma2, iatom).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia + #pragma unroll + for (int ia = 0; ia < na; ia++) { + const SNAcomplex utot1 = ulisttot_pack(iatom_mod,jju1+ma1,elem1,iatom_div); + const SNAcomplex utot2 = ulisttot_pack(iatom_mod,jju2+ma2,elem2,iatom_div); + const auto cgcoeff_a = cgblock[icga]; + const auto cgcoeff_b = cgblock[icgb]; + ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); + ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - ztmp_r += cgblock[icgb] * suma1_r; - ztmp_i += cgblock[icgb] * suma1_i; - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib - if (bnorm_flag) { - ztmp_i /= j + 1; - ztmp_r /= j + 1; - } - - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { - const int jjuy = elem3 * idxu_max + jju; - if (j >= j1) { - const int jjb = idxb_block(j1, j2, j); - itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; - if (j1 == j) { - if (j2 == j) betaj = 3 * beta(itriple, iatom); - else betaj = 2 * beta(itriple, iatom); - } else betaj = beta(itriple, iatom); - } else if (j >= j2) { - const int jjb = idxb_block(j, j2, j1); - itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = 2 * beta(itriple, iatom); - else betaj = beta(itriple, iatom); - } else { - const int jjb = idxb_block(j2, j, j1); - itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; - betaj = beta(itriple, iatom); + if (bnorm_flag) { + ztmp_r /= j + 1; + ztmp_i /= j + 1; } - if (!bnorm_flag && j1 > j) - betaj *= (j1 + 1) / (j + 1.0); + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 - Kokkos::atomic_add(&(ylist(jjuy, iatom).re), betaj * ztmp_r); - Kokkos::atomic_add(&(ylist(jjuy, iatom).im), betaj * ztmp_i); - } - } + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + if (j >= j1) { + const int jjb = idxb_block(j1, j2, j); + const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; + if (j1 == j) { + if (j2 == j) betaj = 3 * beta_pack(iatom_mod, itriple, iatom_div); + else betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div); + } else betaj = beta_pack(iatom_mod, itriple, iatom_div); + } else if (j >= j2) { + const int jjb = idxb_block(j, j2, j1); + const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; + if (j2 == j) betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div); + else betaj = beta_pack(iatom_mod, itriple, iatom_div); + } else { + const int jjb = idxb_block(j2, j, j1); + const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; + betaj = beta_pack(iatom_mod, itriple, iatom_div); + } + + if (!bnorm_flag && j1 > j) + betaj *= (j1 + 1) / (j + 1.0); + + + Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju, elem3, iatom_div)), betaj*ztmp_i); + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 } /* ---------------------------------------------------------------------- @@ -673,7 +734,7 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli const int max_m_tile = (twojmax+1)*(twojmax/2+1); const int team_rank = team.team_rank(); const int scratch_shift = team_rank * max_m_tile; - const int jpos = element(iatom, jnbor)*idxu_max; + const int jelem = element(iatom, jnbor); // double buffer for ulist SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; @@ -723,14 +784,14 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // Accumulate the full contribution to dedr on the fly const double du_prod = dsfac * u; // chain rule - const SNAcomplex y_local = ylist(jpos, iatom); + const SNAcomplex y_local = ylist(0, jelem, iatom); // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0 double dedr_full_sum = 0.5 * du_prod * y_local.re; // single has a warp barrier at the end Kokkos::single(Kokkos::PerThread(team), [=]() { - //dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here + ulist_buf1[0] = {1., 0.}; dulist_buf1[0] = {0., 0.}; }); @@ -755,10 +816,11 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli [&] (const int m, double& sum_tmp) { // ma fast, mb slow - int ma = m % n_ma; - int mb = m / n_ma; + // Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive. + const int mb = m / n_ma; + const int ma = m - mb * n_ma; - const int jju_index = jpos+jju+m; + const int jju_index = jju+m; // Load y_local, apply the symmetry scaling factor // The "secret" of the shared memory optimization is it eliminates @@ -766,7 +828,7 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // shared memory and otherwise always writing, making the kernel // ultimately compute bound. We take advantage of that by adding // some reads back in. - auto y_local = ylist(jju_index,iatom); + auto y_local = ylist(jju_index, jelem, iatom); if (j % 2 == 0 && 2*mb == j) { if (ma == mb) { y_local = 0.5*y_local; } else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright @@ -783,29 +845,35 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli SNAcomplex du_accum = { 0., 0. }; const double rootpq2 = -rootpqarray(ma, j - mb); - const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); + const SNAcomplex u_up2 = rootpq2*((ma > 0) ? ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.)); + + // u_accum += conj(b) * u_up2 caconjxpy(b, u_up2, u_accum); const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.); + const SNAcomplex u_up1 = rootpq1*((ma < j) ? ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.)); + + // u_accum += conj(a) * u_up1 caconjxpy(a, u_up1, u_accum); // Next, spin up du_accum - const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.); + const SNAcomplex du_up1 = rootpq1*((ma < j) ? dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.)); + + // du_accum += conj(da) * u_up1 + conj(a) * du_up1 caconjxpy(da, u_up1, du_accum); caconjxpy(a, du_up1, du_accum); - const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.); + const SNAcomplex du_up2 = rootpq2*((ma > 0) ? dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.)); + + // du_accum += conj(db) * u_up2 + conj(b) * du_up2 caconjxpy(db, u_up2, du_accum); caconjxpy(b, du_up2, du_accum); - // No need to save u_accum to global memory // Cache u_accum, du_accum to scratch memory. ulist_buf2[jju_shared_idx] = u_accum; dulist_buf2[jju_shared_idx] = du_accum; // Directly accumulate deidrj into sum_tmp - //dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum); sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im; @@ -813,7 +881,7 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) if (j%2==1 && mb+1==n_mb) { int sign_factor = (((ma+mb)%2==0)?1:-1); - //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist + const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); if (sign_factor == 1) { @@ -826,8 +894,6 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli // We don't need the second half of the tile for the deidrj accumulation. // That's taken care of by the symmetry factor above. - //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); - // We do need it for ortho polynomial generation, though ulist_buf2[jju_shared_flip] = u_accum; dulist_buf2[jju_shared_flip] = du_accum; @@ -854,69 +920,150 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli }); } -/* ---------------------------------------------------------------------- - compute dEidRj, CPU path only. -------------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + * CPU routines + * ----------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------- + * compute Ui by summing over neighbors j + * ------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) { - t_scalar3 final_sum; - const int jelem = element(iatom, jnbor); + for (int jelem = 0; jelem < nelements; jelem++) { + for (int j = 0; j <= twojmax; j++) { + const int jju = idxu_block(j); - //for(int j = 0; j <= twojmax; j++) { - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j, t_scalar3& sum_tmp) { - int jju = idxu_block[j]; - int jjuy = idxu_block[j] + jelem*idxu_max; + // Only diagonal elements get initialized + // for (int m = 0; m < (j+1)*(j+1); m++) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), + [&] (const int m) { - for(int mb = 0; 2*mb < j; mb++) - for(int ma = 0; ma <= j; ma++) { - sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im; - sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im; - sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im; - jju++; - jjuy++; - } //end loop over ma mb + const int jjup = jju + m; - // For j even, handle middle column + // if m is on the "diagonal", initialize it with the self energy. + // Otherwise zero it out + SNAcomplex init = {0., 0.}; + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = {wself, 0.0}; } //need to map iatom to element - if (j%2 == 0) { - - int mb = j/2; - for(int ma = 0; ma < mb; ma++) { - sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im; - sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im; - sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im; - jju++; - jjuy++; - } - - //int ma = mb; - sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jjuy,iatom).im)*0.5; - sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jjuy,iatom).im)*0.5; - sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jjuy,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jjuy,iatom).im)*0.5; - } // end if jeven - - },final_sum); // end loop over j - - Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,0) = final_sum.x*2.0; - dedr(iatom,jnbor,1) = final_sum.y*2.0; - dedr(iatom,jnbor,2) = final_sum.z*2.0; - }); + ulisttot(jjup, jelem, iatom) = init; + }); + } + } } + /* ---------------------------------------------------------------------- - compute Bi by summing conj(Ui)*Zi + compute Ui by summing over bispectrum components. CPU only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +{ + double rsq, r, x, y, z, z0, theta0; + + // utot(j,ma,mb) = 0 for all j,ma,ma + // utot(j,ma,ma) = 1 for all j,ma + // for j in neighbors of i: + // compute r0 = (x,y,z,z0) + // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb + + x = rij(iatom,jnbor,0); + y = rij(iatom,jnbor,1); + z = rij(iatom,jnbor,2); + rsq = x * x + y * y + z * z; + r = sqrt(rsq); + + theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + // theta0 = (r - rmin0) * rscale0; + z0 = r / tan(theta0); + + compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); + add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), element(iatom, jnbor)); + +} +/* ---------------------------------------------------------------------- + compute Zi by summing over products of Ui, CPU version +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_zi_cpu(const int& iter) +{ + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + + int idouble = 0; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + zlist(jjz, idouble, iatom).re = 0.0; + zlist(jjz, idouble, iatom).im = 0.0; + + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + for(int ib = 0; ib < nb; ib++) { + + double suma1_r = 0.0; + double suma1_i = 0.0; + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min * (j2 + 1) + ma2max; + for(int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re - + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im); + suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im + + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r; + zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i; + + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + zlist(jjz, idouble, iatom).re /= (j+1); + zlist(jjz, idouble, iatom).im /= (j+1); + } + idouble++; + } // end loop over elem2 + } // end loop over elem1 +} + + +/* ---------------------------------------------------------------------- + compute Bi by summing conj(Ui)*Zi, CPU version +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) { // for j1 = 0,...,twojmax // for j2 = 0,twojmax @@ -929,15 +1076,14 @@ void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy::compute_bi(const typename Kokkos::TeamPolicy +KOKKOS_INLINE_FUNCTION +void SNAKokkos::zero_yi_cpu(const int& idx, const int& iatom, const int& ielem) +{ + ylist(idx,ielem,iatom) = {0.0, 0.0}; +} + +/* ---------------------------------------------------------------------- + compute Yi from Ui without storing Zi, looping over zlist indices, + CPU version +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_yi_cpu(int iter, + const Kokkos::View &beta) +{ + double betaj; + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + + const int j1 = idxz(jjz, 0); + const int j2 = idxz(jjz, 1); + const int j = idxz(jjz, 2); + const int ma1min = idxz(jjz, 3); + const int ma2max = idxz(jjz, 4); + const int mb1min = idxz(jjz, 5); + const int mb2max = idxz(jjz, 6); + const int na = idxz(jjz, 7); + const int nb = idxz(jjz, 8); + const int jju = idxz(jjz, 9); + + const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; + //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; + + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + + double ztmp_r = 0.0; + double ztmp_i = 0.0; + + int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; + int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; + int icgb = mb1min * (j2 +1 ) + mb2max; + + for (int ib = 0; ib < nb; ib++) { + + double suma1_r = 0.0; + double suma1_i = 0.0; + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + + for (int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).re - + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).im); + suma1_i += cgblock[icga] * (ulisttot(jju1+ma1, elem1, iatom).re * ulisttot(jju2+ma2, elem2, iatom).im + + ulisttot(jju1+ma1, elem1, iatom).im * ulisttot(jju2+ma2, elem2, iatom).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia + + ztmp_r += cgblock[icgb] * suma1_r; + ztmp_i += cgblock[icgb] * suma1_i; + jju1 += j1 + 1; + jju2 -= j2 + 1; + icgb += j2; + } // end loop over ib + + if (bnorm_flag) { + ztmp_i /= j + 1; + ztmp_r /= j + 1; + } + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + + if (j >= j1) { + const int jjb = idxb_block(j1, j2, j); + const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; + if (j1 == j) { + if (j2 == j) betaj = 3 * beta(itriple, iatom); + else betaj = 2 * beta(itriple, iatom); + } else betaj = beta(itriple, iatom); + } else if (j >= j2) { + const int jjb = idxb_block(j, j2, j1); + const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; + if (j2 == j) betaj = 2 * beta(itriple, iatom); + else betaj = beta(itriple, iatom); + } else { + const int jjb = idxb_block(j2, j, j1); + const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; + betaj = beta(itriple, iatom); + } + + if (!bnorm_flag && j1 > j) + betaj *= (j1 + 1) / (j + 1.0); + + Kokkos::atomic_add(&(ylist(jju, elem3, iatom).re), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist(jju, elem3, iatom).im), betaj*ztmp_i); + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 +} + + /* ---------------------------------------------------------------------- calculate derivative of Ui w.r.t. atom j ------------------------------------------------------------------------- */ @@ -1042,6 +1303,61 @@ void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor)); } + +/* ---------------------------------------------------------------------- + compute dEidRj, CPU path only. +------------------------------------------------------------------------- */ + + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +{ + t_scalar3 final_sum; + const int jelem = element(iatom, jnbor); + + //for(int j = 0; j <= twojmax; j++) { + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), + [&] (const int& j, t_scalar3& sum_tmp) { + int jju = idxu_block[j]; + + for(int mb = 0; 2*mb < j; mb++) + for(int ma = 0; ma <= j; ma++) { + sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im; + sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im; + sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im; + jju++; + } //end loop over ma mb + + // For j even, handle middle column + + if (j%2 == 0) { + + int mb = j/2; + for(int ma = 0; ma < mb; ma++) { + sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im; + sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im; + sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im; + jju++; + } + + //int ma = mb; + sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,jelem,iatom).im)*0.5; + sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,jelem,iatom).im)*0.5; + sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,jelem,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,jelem,iatom).im)*0.5; + } // end if jeven + + },final_sum); // end loop over j + + Kokkos::single(Kokkos::PerThread(team), [&] () { + dedr(iatom,jnbor,0) = final_sum.x*2.0; + dedr(iatom,jnbor,1) = final_sum.y*2.0; + dedr(iatom,jnbor,2) = final_sum.z*2.0; + }); + +} + + /* ---------------------------------------------------------------------- add Wigner U-functions for one neighbor to the total ------------------------------------------------------------------------- */ @@ -1052,10 +1368,11 @@ void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::compute_duarray_cpu(const typename Kokkos::TeamPolic double z0, double r, double dz0dr, double wj, double rcut) { - double r0inv; +double r0inv; double a_r, a_i, b_r, b_i; double da_r[3], da_i[3], db_r[3], db_i[3]; double dz0[3], dr0inv[3], dr0invdr; @@ -1710,6 +2027,7 @@ double SNAKokkos::memory_usage() { int jdimpq = twojmax + 2; int jdim = twojmax + 1; + int natom_pad = ((natom + 32 - 1) / 32) * 32; // for AoSoA layouts double bytes; bytes = 0; @@ -1717,21 +2035,38 @@ double SNAKokkos::memory_usage() bytes += jdimpq*jdimpq * sizeof(double); // pqarray bytes += idxcg_max * sizeof(double); // cglist + + #ifdef KOKKOS_ENABLE_CUDA - if (!std::is_same::value) { + if (std::is_same::value) { + + bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_re + bytes += natom * idxu_max * nelements * sizeof(double); // ulisttot_im + bytes += natom_pad * idxu_max * nelements * sizeof(double) * 2; // ulisttot_pack + + bytes += natom_pad * idxz_max * ndoubles * sizeof(double) * 2; // zlist_pack + bytes += natom_pad * idxb_max * ntriples * sizeof(double); // blist_pack + + bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_re + bytes += natom_pad * idxu_max * nelements * sizeof(double); // ylist_pack_im + bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist + } else { #endif - bytes += natom * idxu_max * sizeof(double) * 2; // ulist - bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist + + bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist + bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot + + bytes += natom * idxz_max * ndoubles * sizeof(double) * 2; // zlist + bytes += natom * idxb_max * ntriples * sizeof(double); // blist + + bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist + + bytes += natom * nmax * idxu_max * 3 * sizeof(double) * 2; // dulist #ifdef KOKKOS_ENABLE_CUDA } #endif - bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot - if (!std::is_same::value) - bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ulisttot_lr - bytes += natom * idxz_max * ndoubles * sizeof(double) * 2; // zlist - bytes += natom * idxb_max * ntriples * sizeof(double); // blist - bytes += natom * idxu_max * nelements * sizeof(double) * 2; // ylist + bytes += natom * nmax * 3 * sizeof(double); // dedr bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block bytes += jdim * sizeof(int); // idxu_block @@ -1747,7 +2082,6 @@ double SNAKokkos::memory_usage() bytes += natom * nmax * sizeof(int); // inside bytes += natom * nmax * sizeof(double); // wj bytes += natom * nmax * sizeof(double); // rcutij - bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist_ij return bytes; } diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index abc8ca1150..e4e5846c9e 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -362,6 +362,7 @@ void MSM::setup() nmax_direct = 8*(nxhi_direct+1)*(nyhi_direct+1)*(nzhi_direct+1); + deallocate(); if (peratom_allocate_flag) deallocate_peratom(); // compute direct sum interaction weights @@ -612,8 +613,6 @@ void MSM::compute(int eflag, int vflag) void MSM::allocate() { - deallocate(); - // interpolation coeffs order_allocated = order; @@ -635,9 +634,9 @@ void MSM::allocate() // allocate memory for each grid level for (int n=0; ncreate3d_offset(qgrid[n],nzlo_out[n],nzhi_out[n], nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:qgrid"); + memory->create3d_offset(egrid[n],nzlo_out[n],nzhi_out[n], nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:egrid"); @@ -660,23 +659,29 @@ void MSM::allocate() void MSM::deallocate() { - delete cg_all; - cg_all = nullptr; - memory->destroy2d_offset(phi1d,-order_allocated); memory->destroy2d_offset(dphi1d,-order_allocated); + if (cg_all) delete cg_all; + cg_all = nullptr; + for (int n=0; ndestroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); + if (qgrid[n]) + memory->destroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - if (world_levels[n] != MPI_COMM_NULL) - MPI_Comm_free(&world_levels[n]); - world_levels[n] = MPI_COMM_NULL; - active_flag[n] = 0; + if (egrid[n]) + memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); - delete cg[n]; - cg[n] = nullptr; + if (world_levels) + if (world_levels[n] != MPI_COMM_NULL) + MPI_Comm_free(&world_levels[n]); + + if (cg) { + if (cg[n]) { + delete cg[n]; + cg[n] = nullptr; + } + } } } @@ -765,7 +770,6 @@ void MSM::deallocate_peratom() void MSM::allocate_levels() { - deallocate_levels(); ngrid = new int[levels]; cg = new GridComm*[levels]; @@ -815,21 +819,21 @@ void MSM::allocate_levels() v5grid = new double***[levels]; for (int n=0; nnonperiodic) levels -= 1; + deallocate_levels(); allocate_levels(); // find number of grid levels in each direction diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp index 6a294d3ccf..5a86caa4fb 100644 --- a/src/atom_vec.cpp +++ b/src/atom_vec.cpp @@ -33,6 +33,8 @@ using namespace MathConst; #define DELTA 16384 #define DELTA_BONUS 8192 +int AtomVec::num_atom_vecs = 0; + /* ---------------------------------------------------------------------- */ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) @@ -54,6 +56,8 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) threads = NULL; + ++num_atom_vecs; + // peratom variables auto-included in corresponding child style fields string // these fields cannot be specified in the fields string @@ -93,44 +97,48 @@ AtomVec::~AtomVec() int datatype,cols; void *pdata; + --num_atom_vecs; + for (int i = 0; i < nargcopy; i++) delete [] argcopy[i]; delete [] argcopy; - memory->destroy(atom->tag); - memory->destroy(atom->type); - memory->destroy(atom->mask); - memory->destroy(atom->image); - memory->destroy(atom->x); - memory->destroy(atom->v); - memory->destroy(atom->f); + if (num_atom_vecs == 0) { + memory->destroy(atom->tag); + memory->destroy(atom->type); + memory->destroy(atom->mask); + memory->destroy(atom->image); + memory->destroy(atom->x); + memory->destroy(atom->v); + memory->destroy(atom->f); - for (int i = 0; i < ngrow; i++) { - pdata = mgrow.pdata[i]; - datatype = mgrow.datatype[i]; - cols = mgrow.cols[i]; - if (datatype == Atom::DOUBLE) { - if (cols == 0) - memory->destroy(*((double **) pdata)); - else if (cols > 0) - memory->destroy(*((double ***) pdata)); - else { - memory->destroy(*((double ***) pdata)); - } - } else if (datatype == Atom::INT) { - if (cols == 0) - memory->destroy(*((int **) pdata)); - else if (cols > 0) - memory->destroy(*((int ***) pdata)); - else { - memory->destroy(*((int ***) pdata)); - } - } else if (datatype == Atom::BIGINT) { - if (cols == 0) - memory->destroy(*((bigint **) pdata)); - else if (cols > 0) - memory->destroy(*((bigint ***) pdata)); - else { - memory->destroy(*((bigint ***) pdata)); + for (int i = 0; i < ngrow; i++) { + pdata = mgrow.pdata[i]; + datatype = mgrow.datatype[i]; + cols = mgrow.cols[i]; + if (datatype == Atom::DOUBLE) { + if (cols == 0) + memory->destroy(*((double **) pdata)); + else if (cols > 0) + memory->destroy(*((double ***) pdata)); + else { + memory->destroy(*((double ***) pdata)); + } + } else if (datatype == Atom::INT) { + if (cols == 0) + memory->destroy(*((int **) pdata)); + else if (cols > 0) + memory->destroy(*((int ***) pdata)); + else { + memory->destroy(*((int ***) pdata)); + } + } else if (datatype == Atom::BIGINT) { + if (cols == 0) + memory->destroy(*((bigint **) pdata)); + else if (cols > 0) + memory->destroy(*((bigint ***) pdata)); + else { + memory->destroy(*((bigint ***) pdata)); + } } } } diff --git a/src/atom_vec.h b/src/atom_vec.h index 8ccf922c4b..625307face 100644 --- a/src/atom_vec.h +++ b/src/atom_vec.h @@ -209,6 +209,10 @@ class AtomVec : protected Pointers { bool *threads; + // counter for atom vec instances + + static int num_atom_vecs; + // local methods void grow_nmax();