diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h
index 9acf3011ce..b7208a577c 100644
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@@ -50,6 +50,7 @@ struct TagPairSNAPBeta{};
 struct TagPairSNAPComputeBi{};
 struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
 struct TagPairSNAPComputeYi{};
+struct TagPairSNAPComputeYiWithZlist{};
 template<int dir> struct TagPairSNAPComputeFusedDeidrj{};
 
@@ -121,11 +122,11 @@ public:
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const;
+  void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT&) const;
+  void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPBetaCPU,const int& ii) const;
@@ -161,6 +162,9 @@ public:
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int idxz, const int iatom_div) const;
+
   template<int dir>
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const;
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index 2578f3d47c..f624131377 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -373,14 +373,16 @@ void PairSNAPKokkos::compute(int eflag_in,
     //Compute beta = dE_i/dB_i for all i in list
     typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
     Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
-
-    //ComputeYi
-    // team_size_compute_yi is defined in `pair_snap_kokkos.h`
     const int idxz_max = snaKK.idxz_max;
-    Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
-      policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
-    Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
-
+    if (quadraticflag || eflag) {
+      Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYiWithZlist>
+        policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
+      Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this);
+    } else {
+      Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
+        policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
+      Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
+    }
   }
 
   // Fused ComputeDuidrj, ComputeDeidrj
@@ -418,30 +420,21 @@ void PairSNAPKokkos::compute(int eflag_in,
 
     //ComputeForce
     {
-      int team_size = team_size_default;
       if (evflag) {
         if (neighflag == HALF) {
-          check_team_size_reduce<TagPairSNAPComputeForce<HALF,1> >(chunk_size,team_size);
-          typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<HALF,1> > policy_force(chunk_size,team_size,vector_length);
-          Kokkos::parallel_reduce(policy_force
-            ,*this,ev_tmp);
+          typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALF,1> > policy_force(0,chunk_size);
+          Kokkos::parallel_reduce(policy_force, *this, ev_tmp);
         } else if (neighflag == HALFTHREAD) {
-          check_team_size_reduce<TagPairSNAPComputeForce<HALFTHREAD,1> >(chunk_size,team_size);
-          typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<HALFTHREAD,1> > policy_force(chunk_size,team_size,vector_length);
-          Kokkos::parallel_reduce(policy_force
-            ,*this,ev_tmp);
+          typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,1> > policy_force(0,chunk_size);
+          Kokkos::parallel_reduce(policy_force, *this, ev_tmp);
         }
       } else {
         if (neighflag == HALF) {
-          check_team_size_for<TagPairSNAPComputeForce<HALF,0> >(chunk_size,team_size);
-          typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<HALF,0> > policy_force(chunk_size,team_size,vector_length);
-          Kokkos::parallel_for(policy_force
-            ,*this);
+          typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALF,0> > policy_force(0,chunk_size);
+          Kokkos::parallel_for(policy_force, *this);
        } else if (neighflag == HALFTHREAD) {
-          check_team_size_for<TagPairSNAPComputeForce<HALFTHREAD,0> >(chunk_size,team_size);
-          typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(chunk_size,team_size,vector_length);
-          Kokkos::parallel_for(policy_force
-            ,*this);
+          typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(0,chunk_size);
+          Kokkos::parallel_for(policy_force, *this);
         }
       }
     }
@@ -796,6 +789,19 @@ void PairSNAPKokkos::operator() (TagPairSN
   my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
 }
 
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (jjz >= my_sna.idxz_max) return;
+
+  my_sna.compute_yi_with_zlist(iatom_mod,jjz,iatom_div,d_beta_pack);
+}
+
 template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
@@ -1137,20 +1143,19 @@ void PairSNAPKokkos::operator() (TagPairSN
 template<class DeviceType, typename real_type, int vector_length>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT& ev) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>, const int& ii, EV_FLOAT& ev) const {
 
   // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
-
   auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
   auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();
 
-  int ii = team.league_rank();
   const int i = d_ilist[ii + chunk_offset];
+
   SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
   const int ninside = d_ninside(ii);
 
-  Kokkos::parallel_for (Kokkos::TeamThreadRange(team,ninside),
-    [&] (const int jj) {
+  for (int jj = 0; jj < ninside; jj++) {
     int j = my_sna.inside(ii,jj);
 
     F_FLOAT fij[3];
@@ -1158,28 +1163,23 @@ void PairSNAPKokkos::operator() (TagPairSN
     fij[1] = my_sna.dedr(ii,jj,1);
     fij[2] = my_sna.dedr(ii,jj,2);
-    Kokkos::single(Kokkos::PerThread(team), [&] () {
-      a_f(i,0) += fij[0];
-      a_f(i,1) += fij[1];
-      a_f(i,2) += fij[2];
-      a_f(j,0) -= fij[0];
-      a_f(j,1) -= fij[1];
-      a_f(j,2) -= fij[2];
-
-      // tally global and per-atom virial contribution
-
-      if (EVFLAG) {
-        if (vflag_either) {
-          v_tally_xyz<NEIGHFLAG>(ev,i,j,
-            fij[0],fij[1],fij[2],
-            -my_sna.rij(ii,jj,0),-my_sna.rij(ii,jj,1),
-            -my_sna.rij(ii,jj,2));
-        }
-      }
+    a_f(i,0) += fij[0];
+    a_f(i,1) += fij[1];
+    a_f(i,2) += fij[2];
+    a_f(j,0) -= fij[0];
+    a_f(j,1) -= fij[1];
+    a_f(j,2) -= fij[2];
+    // tally global and per-atom virial contribution
+    if (EVFLAG) {
+      if (vflag_either) {
+        v_tally_xyz<NEIGHFLAG>(ev,i,j,
+          fij[0],fij[1],fij[2],
+          -my_sna.rij(ii,jj,0),-my_sna.rij(ii,jj,1),
+          -my_sna.rij(ii,jj,2));
+      }
+    }
 
-    });
-  });
-
+  }
 
   // tally energy contribution
 
   if (EVFLAG) {
@@ -1189,48 +1189,41 @@ void PairSNAPKokkos::operator() (TagPairSN
       const int ielem = d_map[itype];
       auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
 
-      Kokkos::single(Kokkos::PerTeam(team), [&] () {
+      // evdwl = energy of atom I, sum over coeffs_k * Bi_k
 
-        // evdwl = energy of atom I, sum over coeffs_k * Bi_k
+      auto evdwl = d_coeffi[0];
 
-        auto evdwl = d_coeffi[0];
+      // E = beta.B + 0.5*B^t.alpha.B
 
-        // E = beta.B + 0.5*B^t.alpha.B
+      const auto idxb_max = snaKK.idxb_max;
 
-        const auto idxb_max = snaKK.idxb_max;
+      // linear contributions
 
-        // linear contributions
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+        const auto idxb = icoeff % idxb_max;
+        const auto idx_chem = icoeff / idxb_max;
+        evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
+      }
+      // quadratic contributions
+      if (quadraticflag) {
+        int k = ncoeff+1;
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
           const auto idxb = icoeff % idxb_max;
           const auto idx_chem = icoeff / idxb_max;
-          evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
-        }
-
-        // quadratic contributions
-
-        if (quadraticflag) {
-          int k = ncoeff+1;
-          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            const auto idxb = icoeff % idxb_max;
-            const auto idx_chem = icoeff / idxb_max;
-            auto bveci = my_sna.blist(idxb,idx_chem,ii);
-
-            evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
-            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              auto jdxb = jcoeff % idxb_max;
-              auto jdx_chem = jcoeff / idxb_max;
-              auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
-
-              evdwl += d_coeffi[k++]*bveci*bvecj;
-            }
+          auto bveci = my_sna.blist(idxb,idx_chem,ii);
+          evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
+          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+            auto jdxb = jcoeff % idxb_max;
+            auto jdx_chem = jcoeff / idxb_max;
+            auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
+            evdwl += d_coeffi[k++]*bveci*bvecj;
          }
        }
-
-        //ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
-        if (eflag_global) ev.evdwl += evdwl;
-        if (eflag_atom) d_eatom[i] += evdwl;
-      });
+      }
+      //ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
+      if (eflag_global) ev.evdwl += evdwl;
+      if (eflag_atom) d_eatom[i] += evdwl;
    }
  }
 }
 
@@ -1238,9 +1231,9 @@ void PairSNAPKokkos::operator() (TagPairSN
 template<class DeviceType, typename real_type, int vector_length>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const {
   EV_FLOAT ev;
-  this->template operator()<NEIGHFLAG,EVFLAG>(TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>(), team, ev);
+  this->template operator()<NEIGHFLAG,EVFLAG>(TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>(), ii, ev);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index 55983f2a90..33ef37ca98 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -125,6 +125,9 @@ inline
   void compute_yi(int,int,int,
    const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
+  void compute_yi_with_zlist(int,int,int,
+   const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
+  KOKKOS_INLINE_FUNCTION
   void compute_bi(const int&, const int&, const int&); // ForceSNAP
 
   // functions for bispectrum coefficients, CPU only
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 2a44f943d3..2b55f97c30 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -880,6 +880,59 @@ void SNAKokkos::compute_yi(int iatom_mod,
   } // end loop over elem1
 }
 
+/* ----------------------------------------------------------------------
+   compute Yi from Ui and the stored Zi (zlist), looping over zlist indices.
+   AoSoA data layout to take advantage of coalescing, avoiding warp
+   divergence. GPU version.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div,
+ const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
+{
+  real_type betaj;
+  const int j1 = idxz(jjz, 0);
+  const int j2 = idxz(jjz, 1);
+  const int j = idxz(jjz, 2);
+  const int jju_half = idxz(jjz, 9);
+  int idouble = 0;
+  for (int elem1 = 0; elem1 < nelements; elem1++) {
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      auto ztmp = zlist_pack(iatom_mod,jjz,idouble,iatom_div);
+      // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+      // find right y_list[jju] and beta(iatom,jjb) entries
+      // multiply and divide by j+1 factors
+      // account for multiplicity of 1, 2, or 3
+      // pick out right beta value
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+        if (j >= j1) {
+          const int jjb = idxb_block(j1, j2, j);
+          const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb;
+          if (j1 == j) {
+            if (j2 == j) betaj = 3 * beta_pack(iatom_mod, itriple, iatom_div);
+            else betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
+          } else betaj = beta_pack(iatom_mod, itriple, iatom_div);
+        } else if (j >= j2) {
+          const int jjb = idxb_block(j, j2, j1);
+          const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb;
+          if (j2 == j) betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
+          else betaj = beta_pack(iatom_mod, itriple, iatom_div);
+        } else {
+          const int jjb = idxb_block(j2, j, j1);
+          const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb;
+          betaj = beta_pack(iatom_mod, itriple, iatom_div);
+        }
+        if (!bnorm_flag && j1 > j)
+          betaj *= (j1 + 1) / (j + 1.0);
+        Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp.re);
+        Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp.im);
+      } // end loop over elem3
+      idouble++;
+    } // end loop over elem2
+  } // end loop over elem1
+}
+
 /* ----------------------------------------------------------------------
    Fused calculation of the derivative of Ui w.r.t. atom j
    and accumulation into dEidRj. GPU only.