Unify the CPU and GPU ComputeYi and ComputeZi routines; extend ComputeYiWithZlist to the CPU

This commit is contained in:
Evan Weinberg
2024-11-20 10:13:11 -08:00
parent 8a65f44237
commit 98b67b8ea0
4 changed files with 42 additions and 129 deletions

View File

@ -36,7 +36,12 @@ PairStyle(snap/kk/host,PairSNAPKokkosDevice<LMPHostType>);
namespace LAMMPS_NS {
// Routines for both the CPU and GPU backend
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagPairSNAPComputeZi{};
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeYiWithZlist{};
struct TagPairSNAPBeta{};
template<int NEIGHFLAG, int EVFLAG>
struct TagPairSNAPComputeForce{};
@ -46,11 +51,7 @@ struct TagPairSNAPComputeCayleyKlein{};
struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagPairSNAPComputeZi{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeYiWithZlist{};
template<int dir>
struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
template<int dir>
@ -60,9 +61,7 @@ struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
struct TagPairSNAPComputeNeighCPU{};
struct TagPairSNAPPreUiCPU{};
struct TagPairSNAPComputeUiCPU{};
struct TagPairSNAPComputeZiCPU{};
struct TagPairSNAPComputeBiCPU{};
struct TagPairSNAPZeroYiCPU{};
struct TagPairSNAPComputeYiCPU{};
struct TagPairSNAPComputeDuidrjCPU{};
struct TagPairSNAPComputeDeidrjCPU{};
@ -212,13 +211,16 @@ class PairSNAPKokkos : public PairSNAP {
void operator() (TagPairSNAPTransformUi, const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;
void operator() (TagPairSNAPComputeZi, const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeBiCPU, const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
void operator() (TagPairSNAPComputeYi, const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYiWithZlist, const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const;

View File

@ -251,32 +251,36 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
// Zero out ylist
int idxu_max = snaKK.idxu_max;
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPTransformUi> policy_transform_ui_cpu(0, chunk_size * idxu_max);
Kokkos::parallel_for("TransformUi",policy_transform_ui_cpu,*this);
Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this);
}
//Compute bispectrum
if (quadraticflag || eflag) {
//ComputeZi
int idxz_max = snaKK.idxz_max;
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZiCPU> policy_zi_cpu(0,chunk_size*idxz_max);
Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZi> policy_zi_cpu(0, chunk_size * idxz_max);
Kokkos::parallel_for("ComputeZiCPU", policy_zi_cpu, *this);
//ComputeBi
int idxb_max = snaKK.idxb_max;
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(0, chunk_size * idxb_max);
Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
//Compute beta = dE_i/dB_i for all i in list
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
}
//ComputeYi
{
//Compute beta = dE_i/dB_i for all i in list
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
//ComputeYi
int idxz_max = snaKK.idxz_max;
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this);
if (quadraticflag || eflag) {
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiWithZlist> policy_yi_cpu(0, chunk_size * idxz_max);
Kokkos::parallel_for("ComputeYiWithZlistCPU", policy_yi_cpu,*this);
} else {
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYi> policy_yi_cpu(0, chunk_size * idxz_max);
Kokkos::parallel_for("ComputeYiCPU", policy_yi_cpu,*this);
}
} // host flag
//ComputeDuidrj and Deidrj
@ -912,19 +916,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi, const int iatom_mod, const int jjz, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= snaKK.idxz_max) return;
snaKK.compute_zi(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZiCPU, const int& ii) const {
snaKK.compute_zi_cpu(ii);
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi, const int& ii) const {
const int iatom = ii / snaKK.idxz_max;
const int jjz = ii % snaKK.idxz_max;
snaKK.compute_zi(iatom, jjz);
}
/* ----------------------------------------------------------------------
@ -1004,19 +1008,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi, const int iatom_mod, const int jjz, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= snaKK.idxz_max) return;
snaKK.compute_yi(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU, const int& ii) const {
snaKK.compute_yi_cpu(ii);
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi, const int& ii) const {
const int iatom = ii / snaKK.idxz_max;
const int jjz = ii % snaKK.idxz_max;
snaKK.compute_yi(iatom, jjz);
}
/* ----------------------------------------------------------------------
@ -1035,6 +1039,14 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
snaKK.compute_yi_with_zlist(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist, const int& ii) const {
const int iatom = ii / snaKK.idxz_max;
const int jjz = ii % snaKK.idxz_max;
snaKK.compute_yi_with_zlist(iatom, jjz);
}
/* ----------------------------------------------------------------------
Assemble the force contributions for each atom, neighbor pair by
contracting the adjoint matrices Y with derivatives of the Wigner

View File

@ -244,10 +244,6 @@ class SNAKokkos {
void pre_ui_cpu(const int&, const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_ui_cpu(const int&, const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_zi_cpu(const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_yi_cpu(int) const; // ForceSNAP
// functions for derivatives, CPU only
KOKKOS_INLINE_FUNCTION

View File

@ -828,30 +828,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_zi(const int& iato
int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg;
idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg);
const real_type* cgblock = cglist.data() + idxcg;
int idouble = 0;
for (int elem1 = 0; elem1 < nelements; elem1++) {
for (int elem2 = 0; elem2 < nelements; elem2++) {
zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock);
idouble++;
}
}
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_zi_cpu(const int& iter) const
{
const int iatom = iter / idxz_max;
const int jjz = iter % idxz_max;
int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg;
idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg);
const real_type *cgblock = cglist.data() + idxcg;
int idouble = 0;
@ -1048,79 +1024,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi(const int& iato
} // end loop over elem1
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_cpu(int iter) const
{
real_type betaj;
const int iatom = iter / idxz_max;
const int jjz = iter % idxz_max;
int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg;
idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg);
const real_type *cgblock = cglist.data() + idxcg;
//int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
//int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
for (int elem1 = 0; elem1 < nelements; elem1++) {
for (int elem2 = 0; elem2 < nelements; elem2++) {
real_type ztmp_r = 0.0;
real_type ztmp_i = 0.0;
int jju1 = idxu_block[j1] + (j1 + 1) * mb1min;
int jju2 = idxu_block[j2] + (j2 + 1) * mb2max;
int icgb = mb1min * (j2 +1) + mb2max;
for (int ib = 0; ib < nb; ib++) {
real_type suma1_r = 0.0;
real_type suma1_i = 0.0;
int ma1 = ma1min;
int ma2 = ma2max;
int icga = ma1min*(j2+1) + ma2max;
for (int ia = 0; ia < na; ia++) {
suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re -
ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im);
suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im +
ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).re);
ma1++;
ma2--;
icga += j2;
} // end loop over ia
ztmp_r += cgblock[icgb] * suma1_r;
ztmp_i += cgblock[icgb] * suma1_i;
jju1 += j1 + 1;
jju2 -= j2 + 1;
icgb += j2;
} // end loop over ib
if (bnorm_flag) {
const real_type scale = static_cast<real_type>(1) / static_cast<real_type>(j + 1);
ztmp_i *= scale;
ztmp_r *= scale;
}
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
// find right y_list[jju] and beta(iatom,jjb) entries
// multiply and divide by j+1 factors
// account for multiplicity of 1, 2, or 3
// pick out right beta value
for (int elem3 = 0; elem3 < nelements; elem3++) {
const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3);
Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj*ztmp_r);
Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj*ztmp_i);
} // end loop over elem3
} // end loop over elem2
} // end loop over elem1
}
/* ----------------------------------------------------------------------
compute Yi from Ui with the precomputed Zi.
------------------------------------------------------------------------- */