Unify the CPU and GPU ComputeYi and ComputeZi routines; extend ComputeYiWithZlist to the CPU
This commit is contained in:
@ -36,7 +36,12 @@ PairStyle(snap/kk/host,PairSNAPKokkosDevice<LMPHostType>);
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
// Routines for both the CPU and GPU backend
|
||||
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
|
||||
struct TagPairSNAPComputeZi{};
|
||||
struct TagPairSNAPComputeYi{};
|
||||
struct TagPairSNAPComputeYiWithZlist{};
|
||||
struct TagPairSNAPBeta{};
|
||||
|
||||
template<int NEIGHFLAG, int EVFLAG>
|
||||
struct TagPairSNAPComputeForce{};
|
||||
|
||||
@ -46,11 +51,7 @@ struct TagPairSNAPComputeCayleyKlein{};
|
||||
struct TagPairSNAPPreUi{};
|
||||
struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
|
||||
struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
|
||||
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
|
||||
struct TagPairSNAPComputeZi{};
|
||||
struct TagPairSNAPComputeBi{};
|
||||
struct TagPairSNAPComputeYi{};
|
||||
struct TagPairSNAPComputeYiWithZlist{};
|
||||
template<int dir>
|
||||
struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
|
||||
template<int dir>
|
||||
@ -60,9 +61,7 @@ struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
|
||||
struct TagPairSNAPComputeNeighCPU{};
|
||||
struct TagPairSNAPPreUiCPU{};
|
||||
struct TagPairSNAPComputeUiCPU{};
|
||||
struct TagPairSNAPComputeZiCPU{};
|
||||
struct TagPairSNAPComputeBiCPU{};
|
||||
struct TagPairSNAPZeroYiCPU{};
|
||||
struct TagPairSNAPComputeYiCPU{};
|
||||
struct TagPairSNAPComputeDuidrjCPU{};
|
||||
struct TagPairSNAPComputeDeidrjCPU{};
|
||||
@ -212,13 +211,16 @@ class PairSNAPKokkos : public PairSNAP {
|
||||
void operator() (TagPairSNAPTransformUi, const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;
|
||||
void operator() (TagPairSNAPComputeZi, const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeBiCPU, const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
|
||||
void operator() (TagPairSNAPComputeYi, const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYiWithZlist, const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const;
|
||||
|
||||
@ -251,32 +251,36 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
|
||||
// Zero out ylist
|
||||
int idxu_max = snaKK.idxu_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPTransformUi> policy_transform_ui_cpu(0, chunk_size * idxu_max);
|
||||
Kokkos::parallel_for("TransformUi",policy_transform_ui_cpu,*this);
|
||||
Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this);
|
||||
}
|
||||
|
||||
//Compute bispectrum
|
||||
if (quadraticflag || eflag) {
|
||||
//ComputeZi
|
||||
int idxz_max = snaKK.idxz_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZiCPU> policy_zi_cpu(0,chunk_size*idxz_max);
|
||||
Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this);
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZi> policy_zi_cpu(0, chunk_size * idxz_max);
|
||||
Kokkos::parallel_for("ComputeZiCPU", policy_zi_cpu, *this);
|
||||
|
||||
//ComputeBi
|
||||
int idxb_max = snaKK.idxb_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(0, chunk_size * idxb_max);
|
||||
Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
|
||||
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
|
||||
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
|
||||
}
|
||||
|
||||
//ComputeYi
|
||||
{
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
|
||||
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
|
||||
|
||||
//ComputeYi
|
||||
int idxz_max = snaKK.idxz_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
|
||||
Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this);
|
||||
if (quadraticflag || eflag) {
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiWithZlist> policy_yi_cpu(0, chunk_size * idxz_max);
|
||||
Kokkos::parallel_for("ComputeYiWithZlistCPU", policy_yi_cpu,*this);
|
||||
} else {
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYi> policy_yi_cpu(0, chunk_size * idxz_max);
|
||||
Kokkos::parallel_for("ComputeYiCPU", policy_yi_cpu,*this);
|
||||
}
|
||||
} // host flag
|
||||
|
||||
//ComputeDuidrj and Deidrj
|
||||
@ -912,19 +916,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi, const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
if (jjz >= snaKK.idxz_max) return;
|
||||
|
||||
snaKK.compute_zi(iatom, jjz);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZiCPU, const int& ii) const {
|
||||
snaKK.compute_zi_cpu(ii);
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi, const int& ii) const {
|
||||
const int iatom = ii / snaKK.idxz_max;
|
||||
const int jjz = ii % snaKK.idxz_max;
|
||||
snaKK.compute_zi(iatom, jjz);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -1004,19 +1008,19 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi, const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
if (jjz >= snaKK.idxz_max) return;
|
||||
|
||||
snaKK.compute_yi(iatom, jjz);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU, const int& ii) const {
|
||||
snaKK.compute_yi_cpu(ii);
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi, const int& ii) const {
|
||||
const int iatom = ii / snaKK.idxz_max;
|
||||
const int jjz = ii % snaKK.idxz_max;
|
||||
snaKK.compute_yi(iatom, jjz);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -1035,6 +1039,14 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
snaKK.compute_yi_with_zlist(iatom, jjz);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist, const int& ii) const {
|
||||
const int iatom = ii / snaKK.idxz_max;
|
||||
const int jjz = ii % snaKK.idxz_max;
|
||||
snaKK.compute_yi_with_zlist(iatom, jjz);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Assemble the force contributions for each atom, neighbor pair by
|
||||
contracting the adjoint matrices Y with derivatives of the Wigner
|
||||
|
||||
@ -244,10 +244,6 @@ class SNAKokkos {
|
||||
void pre_ui_cpu(const int&, const int&) const; // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui_cpu(const int&, const int&) const; // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_zi_cpu(const int&) const; // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_yi_cpu(int) const; // ForceSNAP
|
||||
|
||||
// functions for derivatives, CPU only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
||||
@ -828,30 +828,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_zi(const int& iato
|
||||
int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg;
|
||||
idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg);
|
||||
|
||||
const real_type* cgblock = cglist.data() + idxcg;
|
||||
|
||||
int idouble = 0;
|
||||
|
||||
for (int elem1 = 0; elem1 < nelements; elem1++) {
|
||||
for (int elem2 = 0; elem2 < nelements; elem2++) {
|
||||
|
||||
zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock);
|
||||
|
||||
idouble++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_zi_cpu(const int& iter) const
|
||||
{
|
||||
const int iatom = iter / idxz_max;
|
||||
const int jjz = iter % idxz_max;
|
||||
|
||||
int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg;
|
||||
idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg);
|
||||
|
||||
const real_type *cgblock = cglist.data() + idxcg;
|
||||
|
||||
int idouble = 0;
|
||||
@ -1048,79 +1024,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi(const int& iato
|
||||
} // end loop over elem1
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_cpu(int iter) const
|
||||
{
|
||||
real_type betaj;
|
||||
const int iatom = iter / idxz_max;
|
||||
const int jjz = iter % idxz_max;
|
||||
|
||||
int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg;
|
||||
idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg);
|
||||
|
||||
const real_type *cgblock = cglist.data() + idxcg;
|
||||
//int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
|
||||
//int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
|
||||
|
||||
for (int elem1 = 0; elem1 < nelements; elem1++) {
|
||||
for (int elem2 = 0; elem2 < nelements; elem2++) {
|
||||
|
||||
real_type ztmp_r = 0.0;
|
||||
real_type ztmp_i = 0.0;
|
||||
|
||||
int jju1 = idxu_block[j1] + (j1 + 1) * mb1min;
|
||||
int jju2 = idxu_block[j2] + (j2 + 1) * mb2max;
|
||||
int icgb = mb1min * (j2 +1) + mb2max;
|
||||
|
||||
for (int ib = 0; ib < nb; ib++) {
|
||||
|
||||
real_type suma1_r = 0.0;
|
||||
real_type suma1_i = 0.0;
|
||||
|
||||
int ma1 = ma1min;
|
||||
int ma2 = ma2max;
|
||||
int icga = ma1min*(j2+1) + ma2max;
|
||||
|
||||
for (int ia = 0; ia < na; ia++) {
|
||||
suma1_r += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).re -
|
||||
ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).im);
|
||||
suma1_i += cgblock[icga] * (ulisttot(iatom, elem1, jju1+ma1).re * ulisttot(iatom, elem2, jju2+ma2).im +
|
||||
ulisttot(iatom, elem1, jju1+ma1).im * ulisttot(iatom, elem2, jju2+ma2).re);
|
||||
ma1++;
|
||||
ma2--;
|
||||
icga += j2;
|
||||
} // end loop over ia
|
||||
|
||||
ztmp_r += cgblock[icgb] * suma1_r;
|
||||
ztmp_i += cgblock[icgb] * suma1_i;
|
||||
jju1 += j1 + 1;
|
||||
jju2 -= j2 + 1;
|
||||
icgb += j2;
|
||||
} // end loop over ib
|
||||
|
||||
if (bnorm_flag) {
|
||||
const real_type scale = static_cast<real_type>(1) / static_cast<real_type>(j + 1);
|
||||
ztmp_i *= scale;
|
||||
ztmp_r *= scale;
|
||||
}
|
||||
|
||||
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
|
||||
// find right y_list[jju] and beta(iatom,jjb) entries
|
||||
// multiply and divide by j+1 factors
|
||||
// account for multiplicity of 1, 2, or 3
|
||||
|
||||
// pick out right beta value
|
||||
for (int elem3 = 0; elem3 < nelements; elem3++) {
|
||||
const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3);
|
||||
|
||||
Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj*ztmp_r);
|
||||
Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj*ztmp_i);
|
||||
} // end loop over elem3
|
||||
} // end loop over elem2
|
||||
} // end loop over elem1
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Yi from Ui with the precomputed Zi.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
Reference in New Issue
Block a user