Optimize quadratic Kokkos SNAP, pointed out by @weinbe2

This commit is contained in:
Stan Moore
2021-03-19 17:22:40 -04:00
parent 772d270974
commit 4269eeeef7
4 changed files with 83 additions and 8 deletions

View File

@ -50,6 +50,7 @@ struct TagPairSNAPBeta{};
// Empty marker types. Where they appear in this file they are passed as the
// work tag of a Kokkos execution policy and as the first parameter of the
// operator() overload that implements the corresponding kernel.
struct TagPairSNAPComputeBi{};
struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
// Tag for the Yi kernel launched under the label "ComputeYi".
struct TagPairSNAPComputeYi{};
// Tag for the Yi kernel launched under the label "ComputeYiWithZlist";
// its operator() forwards to compute_yi_with_zlist.
struct TagPairSNAPComputeYiWithZlist{};
// Tag for the fused deidrj team kernel (used with Kokkos::TeamPolicy).
// NOTE(review): `dir` presumably selects a Cartesian direction — confirm.
template<int dir>
struct TagPairSNAPComputeFusedDeidrj{};
@ -161,6 +162,9 @@ public:
// Kernel body selected by the TagPairSNAPComputeYi work tag.
// The three integers are the indices of one work item of the 3D range
// policy: iatom_mod, a z-list index (named idxz here), and iatom_div.
// NOTE(review): the definition is expected to rebuild the atom index as
// iatom_mod + iatom_div * vector_length — confirm against the implementation.
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
// Kernel body selected by the TagPairSNAPComputeYiWithZlist work tag.
// Same index triple as the TagPairSNAPComputeYi overload: iatom_mod, a
// z-list index (named idxz here, jjz in the definition), and iatom_div.
// The definition skips out-of-range work items and otherwise calls
// compute_yi_with_zlist on a copy of the SNA helper object.
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int idxz, const int iatom_div) const;
// Kernel body selected by the TagPairSNAPComputeFusedDeidrj<dir> work tag.
// Unlike the range-policy kernels above it receives a Kokkos team handle
// (TeamPolicy member_type) rather than explicit indices.
// NOTE(review): the meaning of `dir` is not visible here — presumably a
// Cartesian direction index; confirm against the definition.
template<int dir>
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const;

View File

@ -373,16 +373,18 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
//Compute beta = dE_i/dB_i for all i in list
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
//ComputeYi
// tile_size_compute_yi is defined in `pair_snap_kokkos.h`
const int idxz_max = snaKK.idxz_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
if (quadraticflag || eflag) {
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYiWithZlist>
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this);
} else {
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
}
}
// Fused ComputeDuidrj, ComputeDeidrj
{
// team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h`
@ -796,6 +798,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
  // Kernel entry for the "with zlist" Yi pass: one work item handles one
  // (atom, z-list index) pair and delegates to the SNA helper.

  // Operate on a by-value copy of the shared SNA helper object.
  SNAKokkos<DeviceType, real_type, vector_length> sna_local = snaKK;

  // Recombine the split atom index (iatom_div selects the group of
  // vector_length atoms, iatom_mod the position inside that group).
  const int atom_index = iatom_div * vector_length + iatom_mod;

  // Work items outside the atom chunk or past the end of the z-list do nothing.
  if (atom_index >= chunk_size || jjz >= sna_local.idxz_max) return;

  sna_local.compute_yi_with_zlist(iatom_mod,jjz,iatom_div,d_beta_pack);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {

View File

@ -125,6 +125,9 @@ inline
void compute_yi(int,int,int,
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
// Accumulates Yi contributions for one (atom, z-list index) work item using
// the Zi values held in zlist_pack.
// Arguments, in order: iatom_mod, jjz (z-list index), iatom_div, and the
// packed beta coefficients indexed as beta_pack(iatom_mod, itriple, iatom_div).
KOKKOS_INLINE_FUNCTION
void compute_yi_with_zlist(int,int,int,
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
// Takes three integer indices by const reference; the parameters are unnamed
// in this declaration.
// NOTE(review): presumably the same (iatom_mod, index, iatom_div) triple used
// by the other packed kernels — confirm against the definition.
KOKKOS_INLINE_FUNCTION
void compute_bi(const int&, const int&, const int&); // ForceSNAP
// functions for bispectrum coefficients, CPU only

View File

@ -880,6 +880,59 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi(int iatom_mod,
} // end loop over elem1
}
/* ----------------------------------------------------------------------
   compute Yi using the Zi values already stored in zlist_pack, one
   zlist index (jjz) per call.
   AoSoA data layout to take advantage of coalescing, avoiding warp
   divergence. GPU version.

   iatom_mod, iatom_div : split atom index used to address the packed views
   jjz                  : index into idxz / zlist_pack
   beta_pack            : packed beta values, read as
                          beta_pack(iatom_mod, itriple, iatom_div)

   Results are added atomically into ylist_pack_re / ylist_pack_im at
   (iatom_mod, jju_half, elem3, iatom_div).
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div,
 const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
{
  const int j1 = idxz(jjz, 0);
  const int j2 = idxz(jjz, 1);
  const int j = idxz(jjz, 2);
  const int jju_half = idxz(jjz, 9);

  // Everything in this section depends only on (j1, j2, j), so it is
  // resolved once here rather than on every pass of the element loops.
  //
  // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
  // find right y_list[jju] and beta(iatom,jjb) entries
  // multiply and divide by j+1 factors
  // account for multiplicity of 1, 2, or 3
  const bool j_is_largest = (j >= j1);
  const bool j_is_middle = !j_is_largest && (j >= j2);

  int jjb;
  int multiplicity;
  if (j_is_largest) {
    jjb = idxb_block(j1, j2, j);
    multiplicity = (j1 == j) ? ((j2 == j) ? 3 : 2) : 1;
  } else if (j_is_middle) {
    jjb = idxb_block(j, j2, j1);
    multiplicity = (j2 == j) ? 2 : 1;
  } else {
    jjb = idxb_block(j2, j, j1);
    multiplicity = 1;
  }

  // Optional (j1+1)/(j+1) rescaling; kept as a separate multiply after the
  // multiplicity so the floating-point evaluation order is unchanged.
  const bool rescale = !bnorm_flag && j1 > j;
  const double scale = rescale ? (j1 + 1) / (j + 1.0) : 1.0;

  for (int elem1 = 0; elem1 < nelements; elem1++) {
    for (int elem2 = 0; elem2 < nelements; elem2++) {

      // One stored Z value per (elem1, elem2) pair.
      const int idouble = elem1 * nelements + elem2;
      const auto ztmp = zlist_pack(iatom_mod,jjz,idouble,iatom_div);

      for (int elem3 = 0; elem3 < nelements; elem3++) {

        // pick out right beta value: the element triple is permuted to
        // match the argument order used for idxb_block above
        const auto itriple =
          (j_is_largest ? ((elem1 * nelements + elem2) * nelements + elem3)
           : j_is_middle ? ((elem3 * nelements + elem2) * nelements + elem1)
                         : ((elem2 * nelements + elem3) * nelements + elem1)) * idxb_max + jjb;

        real_type betaj = multiplicity * beta_pack(iatom_mod, itriple, iatom_div);

        if (rescale)
          betaj *= scale;

        // Several jjz work items can map to the same jju_half, hence atomics.
        Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp.re);
        Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp.im);
      } // end loop over elem3
    } // end loop over elem2
  } // end loop over elem1
}
/* ----------------------------------------------------------------------
Fused calculation of the derivative of Ui w.r.t. atom j
and accumulation into dEidRj. GPU only.