Merge pull request #2677 from stanmoore1/kk_snap_opt
Optimizations to Kokkos SNAP
This commit is contained in:
@ -50,6 +50,7 @@ struct TagPairSNAPBeta{};
|
||||
struct TagPairSNAPComputeBi{};
|
||||
struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
|
||||
struct TagPairSNAPComputeYi{};
|
||||
struct TagPairSNAPComputeYiWithZlist{};
|
||||
template<int dir>
|
||||
struct TagPairSNAPComputeFusedDeidrj{};
|
||||
|
||||
@ -121,11 +122,11 @@ public:
|
||||
|
||||
template<int NEIGHFLAG, int EVFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const;
|
||||
void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const;
|
||||
|
||||
template<int NEIGHFLAG, int EVFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT&) const;
|
||||
void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPBetaCPU,const int& ii) const;
|
||||
@ -161,6 +162,9 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int idxz, const int iatom_div) const;
|
||||
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const;
|
||||
|
||||
@ -373,14 +373,16 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
|
||||
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
|
||||
|
||||
//ComputeYi
|
||||
// team_size_compute_yi is defined in `pair_snap_kokkos.h`
|
||||
const int idxz_max = snaKK.idxz_max;
|
||||
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
|
||||
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
|
||||
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
|
||||
|
||||
if (quadraticflag || eflag) {
|
||||
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYiWithZlist>
|
||||
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
|
||||
Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this);
|
||||
} else {
|
||||
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
|
||||
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
|
||||
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
|
||||
}
|
||||
}
|
||||
|
||||
// Fused ComputeDuidrj, ComputeDeidrj
|
||||
@ -418,30 +420,21 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
|
||||
|
||||
//ComputeForce
|
||||
{
|
||||
int team_size = team_size_default;
|
||||
if (evflag) {
|
||||
if (neighflag == HALF) {
|
||||
check_team_size_reduce<TagPairSNAPComputeForce<HALF,1> >(chunk_size,team_size);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALF,1> > policy_force(chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_reduce(policy_force
|
||||
,*this,ev_tmp);
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALF,1> > policy_force(0,chunk_size);
|
||||
Kokkos::parallel_reduce(policy_force, *this, ev_tmp);
|
||||
} else if (neighflag == HALFTHREAD) {
|
||||
check_team_size_reduce<TagPairSNAPComputeForce<HALFTHREAD,1> >(chunk_size,team_size);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,1> > policy_force(chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_reduce(policy_force
|
||||
,*this,ev_tmp);
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,1> > policy_force(0,chunk_size);
|
||||
Kokkos::parallel_reduce(policy_force, *this, ev_tmp);
|
||||
}
|
||||
} else {
|
||||
if (neighflag == HALF) {
|
||||
check_team_size_for<TagPairSNAPComputeForce<HALF,0> >(chunk_size,team_size);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALF,0> > policy_force(chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for(policy_force
|
||||
,*this);
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALF,0> > policy_force(0,chunk_size);
|
||||
Kokkos::parallel_for(policy_force, *this);
|
||||
} else if (neighflag == HALFTHREAD) {
|
||||
check_team_size_for<TagPairSNAPComputeForce<HALFTHREAD,0> >(chunk_size,team_size);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for(policy_force
|
||||
,*this);
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(0,chunk_size);
|
||||
Kokkos::parallel_for(policy_force, *this);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -796,6 +789,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (jjz >= my_sna.idxz_max) return;
|
||||
|
||||
my_sna.compute_yi_with_zlist(iatom_mod,jjz,iatom_div,d_beta_pack);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
@ -1137,20 +1143,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<int NEIGHFLAG, int EVFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT& ev) const {
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>, const int& ii, EV_FLOAT& ev) const {
|
||||
|
||||
// The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
|
||||
|
||||
auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
|
||||
auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();
|
||||
|
||||
int ii = team.league_rank();
|
||||
const int i = d_ilist[ii + chunk_offset];
|
||||
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
|
||||
Kokkos::parallel_for (Kokkos::TeamThreadRange(team,ninside),
|
||||
[&] (const int jj) {
|
||||
for (int jj = 0; jj < ninside; jj++) {
|
||||
int j = my_sna.inside(ii,jj);
|
||||
|
||||
F_FLOAT fij[3];
|
||||
@ -1158,28 +1163,23 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
fij[1] = my_sna.dedr(ii,jj,1);
|
||||
fij[2] = my_sna.dedr(ii,jj,2);
|
||||
|
||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||
a_f(i,0) += fij[0];
|
||||
a_f(i,1) += fij[1];
|
||||
a_f(i,2) += fij[2];
|
||||
a_f(j,0) -= fij[0];
|
||||
a_f(j,1) -= fij[1];
|
||||
a_f(j,2) -= fij[2];
|
||||
|
||||
// tally global and per-atom virial contribution
|
||||
|
||||
if (EVFLAG) {
|
||||
if (vflag_either) {
|
||||
v_tally_xyz<NEIGHFLAG>(ev,i,j,
|
||||
fij[0],fij[1],fij[2],
|
||||
-my_sna.rij(ii,jj,0),-my_sna.rij(ii,jj,1),
|
||||
-my_sna.rij(ii,jj,2));
|
||||
}
|
||||
a_f(i,0) += fij[0];
|
||||
a_f(i,1) += fij[1];
|
||||
a_f(i,2) += fij[2];
|
||||
a_f(j,0) -= fij[0];
|
||||
a_f(j,1) -= fij[1];
|
||||
a_f(j,2) -= fij[2];
|
||||
// tally global and per-atom virial contribution
|
||||
if (EVFLAG) {
|
||||
if (vflag_either) {
|
||||
v_tally_xyz<NEIGHFLAG>(ev,i,j,
|
||||
fij[0],fij[1],fij[2],
|
||||
-my_sna.rij(ii,jj,0),-my_sna.rij(ii,jj,1),
|
||||
-my_sna.rij(ii,jj,2));
|
||||
}
|
||||
}
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
}
|
||||
// tally energy contribution
|
||||
|
||||
if (EVFLAG) {
|
||||
@ -1189,48 +1189,41 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
const int ielem = d_map[itype];
|
||||
auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
|
||||
|
||||
Kokkos::single(Kokkos::PerTeam(team), [&] () {
|
||||
// evdwl = energy of atom I, sum over coeffs_k * Bi_k
|
||||
|
||||
// evdwl = energy of atom I, sum over coeffs_k * Bi_k
|
||||
auto evdwl = d_coeffi[0];
|
||||
|
||||
auto evdwl = d_coeffi[0];
|
||||
// E = beta.B + 0.5*B^t.alpha.B
|
||||
|
||||
// E = beta.B + 0.5*B^t.alpha.B
|
||||
const auto idxb_max = snaKK.idxb_max;
|
||||
|
||||
const auto idxb_max = snaKK.idxb_max;
|
||||
// linear contributions
|
||||
|
||||
// linear contributions
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
|
||||
}
|
||||
|
||||
// quadratic contributions
|
||||
if (quadraticflag) {
|
||||
int k = ncoeff+1;
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
|
||||
}
|
||||
|
||||
// quadratic contributions
|
||||
|
||||
if (quadraticflag) {
|
||||
int k = ncoeff+1;
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
auto bveci = my_sna.blist(idxb,idx_chem,ii);
|
||||
|
||||
evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
auto jdxb = jcoeff % idxb_max;
|
||||
auto jdx_chem = jcoeff / idxb_max;
|
||||
auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
|
||||
|
||||
evdwl += d_coeffi[k++]*bveci*bvecj;
|
||||
}
|
||||
auto bveci = my_sna.blist(idxb,idx_chem,ii);
|
||||
evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
auto jdxb = jcoeff % idxb_max;
|
||||
auto jdx_chem = jcoeff / idxb_max;
|
||||
auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
|
||||
evdwl += d_coeffi[k++]*bveci*bvecj;
|
||||
}
|
||||
}
|
||||
|
||||
//ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
|
||||
if (eflag_global) ev.evdwl += evdwl;
|
||||
if (eflag_atom) d_eatom[i] += evdwl;
|
||||
});
|
||||
}
|
||||
//ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
|
||||
if (eflag_global) ev.evdwl += evdwl;
|
||||
if (eflag_atom) d_eatom[i] += evdwl;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1238,9 +1231,9 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<int NEIGHFLAG, int EVFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const {
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const {
|
||||
EV_FLOAT ev;
|
||||
this->template operator()<NEIGHFLAG,EVFLAG>(TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>(), team, ev);
|
||||
this->template operator()<NEIGHFLAG,EVFLAG>(TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>(), ii, ev);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
@ -125,6 +125,9 @@ inline
|
||||
void compute_yi(int,int,int,
|
||||
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_yi_with_zlist(int,int,int,
|
||||
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_bi(const int&, const int&, const int&); // ForceSNAP
|
||||
|
||||
// functions for bispectrum coefficients, CPU only
|
||||
|
||||
@ -880,6 +880,59 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi(int iatom_mod,
|
||||
} // end loop over elem1
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Yi from Ui without storing Zi, looping over zlist indices.
|
||||
AoSoA data layout to take advantage of coalescing, avoiding warp
|
||||
divergence. GPU version.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div,
|
||||
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
|
||||
{
|
||||
real_type betaj;
|
||||
const int j1 = idxz(jjz, 0);
|
||||
const int j2 = idxz(jjz, 1);
|
||||
const int j = idxz(jjz, 2);
|
||||
const int jju_half = idxz(jjz, 9);
|
||||
int idouble = 0;
|
||||
for (int elem1 = 0; elem1 < nelements; elem1++) {
|
||||
for (int elem2 = 0; elem2 < nelements; elem2++) {
|
||||
auto ztmp = zlist_pack(iatom_mod,jjz,idouble,iatom_div);
|
||||
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
|
||||
// find right y_list[jju] and beta(iatom,jjb) entries
|
||||
// multiply and divide by j+1 factors
|
||||
// account for multiplicity of 1, 2, or 3
|
||||
// pick out right beta value
|
||||
for (int elem3 = 0; elem3 < nelements; elem3++) {
|
||||
if (j >= j1) {
|
||||
const int jjb = idxb_block(j1, j2, j);
|
||||
const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb;
|
||||
if (j1 == j) {
|
||||
if (j2 == j) betaj = 3 * beta_pack(iatom_mod, itriple, iatom_div);
|
||||
else betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
|
||||
} else betaj = beta_pack(iatom_mod, itriple, iatom_div);
|
||||
} else if (j >= j2) {
|
||||
const int jjb = idxb_block(j, j2, j1);
|
||||
const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb;
|
||||
if (j2 == j) betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
|
||||
else betaj = beta_pack(iatom_mod, itriple, iatom_div);
|
||||
} else {
|
||||
const int jjb = idxb_block(j2, j, j1);
|
||||
const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb;
|
||||
betaj = beta_pack(iatom_mod, itriple, iatom_div);
|
||||
}
|
||||
if (!bnorm_flag && j1 > j)
|
||||
betaj *= (j1 + 1) / (j + 1.0);
|
||||
Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp.re);
|
||||
Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp.im);
|
||||
} // end loop over elem3
|
||||
idouble++;
|
||||
} // end loop over elem2
|
||||
} // end loop over elem1
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Fused calculation of the derivative of Ui w.r.t. atom j
|
||||
and accumulation into dEidRj. GPU only.
|
||||
|
||||
Reference in New Issue
Block a user