Merge pull request #2677 from stanmoore1/kk_snap_opt

Optimizations to Kokkos SNAP
This commit is contained in:
Axel Kohlmeyer
2021-03-23 13:32:02 -04:00
committed by GitHub
4 changed files with 137 additions and 84 deletions

View File

@ -50,6 +50,7 @@ struct TagPairSNAPBeta{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeYiWithZlist{};
template<int dir>
struct TagPairSNAPComputeFusedDeidrj{};
@ -121,11 +122,11 @@ public:
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const;
void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const;
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT&) const;
void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPBetaCPU,const int& ii) const;
@ -161,6 +162,9 @@ public:
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int idxz, const int iatom_div) const;
template<int dir>
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const;

View File

@ -373,14 +373,16 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
//Compute beta = dE_i/dB_i for all i in list
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
//ComputeYi
// tile_size_compute_yi is defined in `pair_snap_kokkos.h`
const int idxz_max = snaKK.idxz_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
if (quadraticflag || eflag) {
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYiWithZlist>
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this);
} else {
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
}
}
// Fused ComputeDuidrj, ComputeDeidrj
@ -418,30 +420,21 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
//ComputeForce
{
int team_size = team_size_default;
if (evflag) {
if (neighflag == HALF) {
check_team_size_reduce<TagPairSNAPComputeForce<HALF,1> >(chunk_size,team_size);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALF,1> > policy_force(chunk_size,team_size,vector_length);
Kokkos::parallel_reduce(policy_force
,*this,ev_tmp);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALF,1> > policy_force(0,chunk_size);
Kokkos::parallel_reduce(policy_force, *this, ev_tmp);
} else if (neighflag == HALFTHREAD) {
check_team_size_reduce<TagPairSNAPComputeForce<HALFTHREAD,1> >(chunk_size,team_size);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,1> > policy_force(chunk_size,team_size,vector_length);
Kokkos::parallel_reduce(policy_force
,*this,ev_tmp);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,1> > policy_force(0,chunk_size);
Kokkos::parallel_reduce(policy_force, *this, ev_tmp);
}
} else {
if (neighflag == HALF) {
check_team_size_for<TagPairSNAPComputeForce<HALF,0> >(chunk_size,team_size);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALF,0> > policy_force(chunk_size,team_size,vector_length);
Kokkos::parallel_for(policy_force
,*this);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALF,0> > policy_force(0,chunk_size);
Kokkos::parallel_for(policy_force, *this);
} else if (neighflag == HALFTHREAD) {
check_team_size_for<TagPairSNAPComputeForce<HALFTHREAD,0> >(chunk_size,team_size);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(chunk_size,team_size,vector_length);
Kokkos::parallel_for(policy_force
,*this);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(0,chunk_size);
Kokkos::parallel_for(policy_force, *this);
}
}
}
@ -796,6 +789,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
  // Kernel body for the TagPairSNAPComputeYiWithZlist tag: forwards one
  // (iatom_mod, jjz, iatom_div) index triple to
  // SNAKokkos::compute_yi_with_zlist together with d_beta_pack.
  SNAKokkos<DeviceType, real_type, vector_length> sna = snaKK;

  // The flat atom index is rebuilt from its (mod, div) split. Index triples
  // beyond the current chunk, or beyond idxz_max, do no work.
  const bool atom_out_of_range = (iatom_mod + iatom_div * vector_length) >= chunk_size;
  if (atom_out_of_range || jjz >= sna.idxz_max) return;

  sna.compute_yi_with_zlist(iatom_mod,jjz,iatom_div,d_beta_pack);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
@ -1137,20 +1143,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
template<class DeviceType, typename real_type, int vector_length>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT& ev) const {
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>, const int& ii, EV_FLOAT& ev) const {
// The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();
int ii = team.league_rank();
const int i = d_ilist[ii + chunk_offset];
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int ninside = d_ninside(ii);
Kokkos::parallel_for (Kokkos::TeamThreadRange(team,ninside),
[&] (const int jj) {
for (int jj = 0; jj < ninside; jj++) {
int j = my_sna.inside(ii,jj);
F_FLOAT fij[3];
@ -1158,28 +1163,23 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
fij[1] = my_sna.dedr(ii,jj,1);
fij[2] = my_sna.dedr(ii,jj,2);
Kokkos::single(Kokkos::PerThread(team), [&] () {
a_f(i,0) += fij[0];
a_f(i,1) += fij[1];
a_f(i,2) += fij[2];
a_f(j,0) -= fij[0];
a_f(j,1) -= fij[1];
a_f(j,2) -= fij[2];
// tally global and per-atom virial contribution
if (EVFLAG) {
if (vflag_either) {
v_tally_xyz<NEIGHFLAG>(ev,i,j,
fij[0],fij[1],fij[2],
-my_sna.rij(ii,jj,0),-my_sna.rij(ii,jj,1),
-my_sna.rij(ii,jj,2));
}
a_f(i,0) += fij[0];
a_f(i,1) += fij[1];
a_f(i,2) += fij[2];
a_f(j,0) -= fij[0];
a_f(j,1) -= fij[1];
a_f(j,2) -= fij[2];
// tally global and per-atom virial contribution
if (EVFLAG) {
if (vflag_either) {
v_tally_xyz<NEIGHFLAG>(ev,i,j,
fij[0],fij[1],fij[2],
-my_sna.rij(ii,jj,0),-my_sna.rij(ii,jj,1),
-my_sna.rij(ii,jj,2));
}
}
});
});
}
// tally energy contribution
if (EVFLAG) {
@ -1189,48 +1189,41 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
const int ielem = d_map[itype];
auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
Kokkos::single(Kokkos::PerTeam(team), [&] () {
// evdwl = energy of atom I, sum over coeffs_k * Bi_k
// evdwl = energy of atom I, sum over coeffs_k * Bi_k
auto evdwl = d_coeffi[0];
auto evdwl = d_coeffi[0];
// E = beta.B + 0.5*B^t.alpha.B
// E = beta.B + 0.5*B^t.alpha.B
const auto idxb_max = snaKK.idxb_max;
const auto idxb_max = snaKK.idxb_max;
// linear contributions
// linear contributions
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
}
// quadratic contributions
if (quadraticflag) {
int k = ncoeff+1;
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
}
// quadratic contributions
if (quadraticflag) {
int k = ncoeff+1;
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
auto bveci = my_sna.blist(idxb,idx_chem,ii);
evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
auto jdxb = jcoeff % idxb_max;
auto jdx_chem = jcoeff / idxb_max;
auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
evdwl += d_coeffi[k++]*bveci*bvecj;
}
auto bveci = my_sna.blist(idxb,idx_chem,ii);
evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
auto jdxb = jcoeff % idxb_max;
auto jdx_chem = jcoeff / idxb_max;
auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
evdwl += d_coeffi[k++]*bveci*bvecj;
}
}
//ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
if (eflag_global) ev.evdwl += evdwl;
if (eflag_atom) d_eatom[i] += evdwl;
});
}
//ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
if (eflag_global) ev.evdwl += evdwl;
if (eflag_atom) d_eatom[i] += evdwl;
}
}
}
@ -1238,9 +1231,9 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
template<class DeviceType, typename real_type, int vector_length>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const {
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const {
EV_FLOAT ev;
this->template operator()<NEIGHFLAG,EVFLAG>(TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>(), team, ev);
this->template operator()<NEIGHFLAG,EVFLAG>(TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>(), ii, ev);
}
/* ---------------------------------------------------------------------- */

View File

@ -125,6 +125,9 @@ inline
void compute_yi(int,int,int,
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_yi_with_zlist(int,int,int,
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_bi(const int&, const int&, const int&); // ForceSNAP
// functions for bispectrum coefficients, CPU only

View File

@ -880,6 +880,59 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi(int iatom_mod,
} // end loop over elem1
}
/* ----------------------------------------------------------------------
   accumulate Yi contributions for a single zlist index (jjz), using the
   Zi values read from zlist_pack, each weighted by the matching
   beta_pack entry and added atomically into ylist_pack_re/_im.
   AoSoA data layout to take advantage of coalescing, avoiding warp
   divergence. GPU version.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div,
 const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
{
  // j1, j2, j and the ylist slot jju_half are columns 0, 1, 2 and 9 of idxz
  const int j1 = idxz(jjz, 0);
  const int j2 = idxz(jjz, 1);
  const int j = idxz(jjz, 2);
  const int jju_half = idxz(jjz, 9);

  // the (j1+1)/(j+1) factor is applied only when bnorm_flag is off and j1 > j;
  // neither condition changes inside the element loops
  const bool rescale = !bnorm_flag && j1 > j;

  for (int elem_a = 0; elem_a < nelements; elem_a++) {
    for (int elem_b = 0; elem_b < nelements; elem_b++) {

      // position of the (elem_a, elem_b) pair, counted in loop order
      const int idouble = elem_a * nelements + elem_b;
      const auto zval = zlist_pack(iatom_mod, jjz, idouble, iatom_div);

      // each z entry feeds exactly one y(j) element per third element:
      // select the beta entry that goes with it, scale it by its
      // multiplicity (1, 2, or 3), then add beta*z into ylist
      for (int elem_c = 0; elem_c < nelements; elem_c++) {

        real_type betaj;

        if (j < j1) {
          if (j < j2) {
            // j is smaller than both j1 and j2: multiplicity 1
            const int jjb = idxb_block(j2, j, j1);
            const auto itriple = ((elem_b * nelements + elem_c) * nelements + elem_a) * idxb_max + jjb;
            betaj = beta_pack(iatom_mod, itriple, iatom_div);
          } else {
            // j2 <= j < j1: multiplicity 2 when j2 == j, otherwise 1
            const int jjb = idxb_block(j, j2, j1);
            const auto itriple = ((elem_c * nelements + elem_b) * nelements + elem_a) * idxb_max + jjb;
            if (j2 == j) betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
            else betaj = beta_pack(iatom_mod, itriple, iatom_div);
          }
        } else {
          // j >= j1: multiplicity 3 when j1 == j2 == j, 2 when only j1 == j
          const int jjb = idxb_block(j1, j2, j);
          const auto itriple = ((elem_a * nelements + elem_b) * nelements + elem_c) * idxb_max + jjb;
          if (j1 != j) betaj = beta_pack(iatom_mod, itriple, iatom_div);
          else if (j2 == j) betaj = 3 * beta_pack(iatom_mod, itriple, iatom_div);
          else betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
        }

        if (rescale)
          betaj *= (j1 + 1) / (j + 1.0);

        // atomic accumulation into the shared ylist entry for (jju_half, elem_c)
        Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem_c, iatom_div)), betaj*zval.re);
        Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem_c, iatom_div)), betaj*zval.im);
      } // end loop over elem_c
    } // end loop over elem_b
  } // end loop over elem_a
}
/* ----------------------------------------------------------------------
Fused calculation of the derivative of Ui w.r.t. atom j
and accumulation into dEidRj. GPU only.