Optimize quadratic Kokkos SNAP, pointed out by @weinbe2
This commit is contained in:
@ -50,6 +50,7 @@ struct TagPairSNAPBeta{};
|
||||
struct TagPairSNAPComputeBi{};
|
||||
struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
|
||||
struct TagPairSNAPComputeYi{};
|
||||
struct TagPairSNAPComputeYiWithZlist{}; // dispatches to compute_yi_with_zlist, which reads zlist_pack
|
||||
template<int dir>
|
||||
struct TagPairSNAPComputeFusedDeidrj{};
|
||||
|
||||
@ -161,6 +162,9 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int idxz, const int iatom_div) const;
|
||||
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const;
|
||||
|
||||
@ -373,16 +373,18 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
|
||||
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
|
||||
|
||||
//ComputeYi
|
||||
// team_size_compute_yi is defined in `pair_snap_kokkos.h`
|
||||
const int idxz_max = snaKK.idxz_max;
|
||||
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
|
||||
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
|
||||
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
|
||||
|
||||
if (quadraticflag || eflag) {
|
||||
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYiWithZlist>
|
||||
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
|
||||
Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this);
|
||||
} else {
|
||||
Snap3DRangePolicy<DeviceType, tile_size_compute_yi, TagPairSNAPComputeYi>
|
||||
policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1});
|
||||
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Fused ComputeDuidrj, ComputeDeidrj
|
||||
{
|
||||
// team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h`
|
||||
@ -796,6 +798,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (jjz >= my_sna.idxz_max) return;
|
||||
|
||||
my_sna.compute_yi_with_zlist(iatom_mod,jjz,iatom_div,d_beta_pack);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
|
||||
@ -125,6 +125,9 @@ inline
|
||||
void compute_yi(int,int,int,
|
||||
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_yi_with_zlist(int,int,int,
|
||||
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_bi(const int&, const int&, const int&); // ForceSNAP
|
||||
|
||||
// functions for bispectrum coefficients, CPU only
|
||||
|
||||
@ -880,6 +880,59 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi(int iatom_mod,
|
||||
} // end loop over elem1
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Yi from the stored Zi (zlist) and beta, looping over zlist indices.
|
||||
AoSoA data layout to take advantage of coalescing, avoiding warp
|
||||
divergence. GPU version.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div,
|
||||
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
|
||||
{
|
||||
real_type betaj;
|
||||
const int j1 = idxz(jjz, 0);
|
||||
const int j2 = idxz(jjz, 1);
|
||||
const int j = idxz(jjz, 2);
|
||||
const int jju_half = idxz(jjz, 9);
|
||||
int idouble = 0;
|
||||
for (int elem1 = 0; elem1 < nelements; elem1++) {
|
||||
for (int elem2 = 0; elem2 < nelements; elem2++) {
|
||||
auto ztmp = zlist_pack(iatom_mod,jjz,idouble,iatom_div);
|
||||
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
|
||||
// find right y_list[jju] and beta(iatom,jjb) entries
|
||||
// multiply and divide by j+1 factors
|
||||
// account for multiplicity of 1, 2, or 3
|
||||
// pick out right beta value
|
||||
for (int elem3 = 0; elem3 < nelements; elem3++) {
|
||||
if (j >= j1) {
|
||||
const int jjb = idxb_block(j1, j2, j);
|
||||
const auto itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb;
|
||||
if (j1 == j) {
|
||||
if (j2 == j) betaj = 3 * beta_pack(iatom_mod, itriple, iatom_div);
|
||||
else betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
|
||||
} else betaj = beta_pack(iatom_mod, itriple, iatom_div);
|
||||
} else if (j >= j2) {
|
||||
const int jjb = idxb_block(j, j2, j1);
|
||||
const auto itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb;
|
||||
if (j2 == j) betaj = 2 * beta_pack(iatom_mod, itriple, iatom_div);
|
||||
else betaj = beta_pack(iatom_mod, itriple, iatom_div);
|
||||
} else {
|
||||
const int jjb = idxb_block(j2, j, j1);
|
||||
const auto itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb;
|
||||
betaj = beta_pack(iatom_mod, itriple, iatom_div);
|
||||
}
|
||||
if (!bnorm_flag && j1 > j)
|
||||
betaj *= (j1 + 1) / (j + 1.0);
|
||||
Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp.re);
|
||||
Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj*ztmp.im);
|
||||
} // end loop over elem3
|
||||
idouble++;
|
||||
} // end loop over elem2
|
||||
} // end loop over elem1
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Fused calculation of the derivative of Ui w.r.t. atom j
|
||||
and accumulation into dEidRj. GPU only.
|
||||
|
||||
Reference in New Issue
Block a user