Unify the CPU and GPU PreUi routines
This commit is contained in:
@ -36,22 +36,21 @@ PairStyle(snap/kk/host,PairSNAPKokkosDevice<LMPHostType>);
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
// Routines for both the CPU and GPU backend
|
||||
struct TagPairSNAPPreUi{}; // selects the operator() overload that splits its flat index into (iatom, j) and calls snaKK.pre_ui
|
||||
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
|
||||
struct TagPairSNAPComputeZi{};
|
||||
struct TagPairSNAPComputeBi{}; // selects the operator() overload that splits its flat index into (iatom, jjb) and calls snaKK.compute_bi
|
||||
struct TagPairSNAPBeta{};
|
||||
struct TagPairSNAPComputeYi{};
|
||||
struct TagPairSNAPComputeYiWithZlist{};
|
||||
struct TagPairSNAPBeta{};
|
||||
|
||||
template<int NEIGHFLAG, int EVFLAG>
|
||||
struct TagPairSNAPComputeForce{};
|
||||
|
||||
// GPU backend only
|
||||
struct TagPairSNAPComputeNeigh{};
|
||||
struct TagPairSNAPComputeCayleyKlein{};
|
||||
struct TagPairSNAPPreUi{};
|
||||
struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
|
||||
struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
|
||||
struct TagPairSNAPComputeBi{};
|
||||
template<int dir>
|
||||
struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
|
||||
template<int dir>
|
||||
@ -59,10 +58,7 @@ struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
|
||||
|
||||
// CPU backend only
|
||||
struct TagPairSNAPComputeNeighCPU{};
|
||||
struct TagPairSNAPPreUiCPU{};
|
||||
struct TagPairSNAPComputeUiCPU{};
|
||||
struct TagPairSNAPComputeBiCPU{};
|
||||
struct TagPairSNAPComputeYiCPU{};
|
||||
struct TagPairSNAPComputeDuidrjCPU{};
|
||||
struct TagPairSNAPComputeDeidrjCPU{};
|
||||
|
||||
@ -202,7 +198,7 @@ class PairSNAPKokkos : public PairSNAP {
|
||||
void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeighCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPPreUiCPU, const int& iatom) const;
|
||||
void operator() (TagPairSNAPPreUi, const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeUiCPU, const int& ii) const;
|
||||
@ -214,7 +210,7 @@ class PairSNAPKokkos : public PairSNAP {
|
||||
void operator() (TagPairSNAPComputeZi, const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeBiCPU, const int& ii) const;
|
||||
void operator() (TagPairSNAPComputeBi, const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYi, const int& ii) const;
|
||||
|
||||
@ -235,7 +235,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
|
||||
|
||||
//PreUi
|
||||
{
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu(0, chunk_size);
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPPreUi> policy_preui_cpu(0, chunk_size * (twojmax + 1));
|
||||
Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
|
||||
}
|
||||
|
||||
@ -263,7 +263,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
|
||||
|
||||
//ComputeBi
|
||||
int idxb_max = snaKK.idxb_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(0, chunk_size * idxb_max);
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeBi> policy_bi_cpu(0, chunk_size * idxb_max);
|
||||
Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
|
||||
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
@ -807,11 +807,14 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU, const int& iatom) const {
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int& ii) const {
|
||||
const int iatom = ii / (twojmax+1);
|
||||
const int j = ii % (twojmax+1);
|
||||
|
||||
const int itype = type(iatom);
|
||||
const int ielem = d_map[itype];
|
||||
|
||||
snaKK.pre_ui_cpu(iatom, ielem);
|
||||
snaKK.pre_ui(iatom, j, ielem);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -944,12 +947,12 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
if (iatom >= chunk_size) return;
|
||||
if (jjb >= snaKK.idxb_max) return;
|
||||
|
||||
snaKK.compute_bi(iatom,jjb);
|
||||
snaKK.compute_bi(iatom, jjb);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBiCPU, const int& ii) const {
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi, const int& ii) const {
|
||||
const int iatom = ii / snaKK.idxb_max;
|
||||
const int jjb = ii % snaKK.idxb_max;
|
||||
snaKK.compute_bi(iatom, jjb);
|
||||
|
||||
@ -241,8 +241,6 @@ class SNAKokkos {
|
||||
|
||||
// functions for bispectrum coefficients, CPU only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void pre_ui_cpu(const int&, const int&) const; // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui_cpu(const int&, const int&) const; // ForceSNAP
|
||||
|
||||
// functions for derivatives, CPU only
|
||||
|
||||
@ -466,30 +466,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui(const int& iatom, c
|
||||
}
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const int& iatom, const int& ielem) const
|
||||
{
|
||||
for (int jelem = 0; jelem < nelements; jelem++) {
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug
|
||||
|
||||
// Only diagonal elements get initialized
|
||||
for (int m = 0; m < (j+1)*(j/2+1); m++) {
|
||||
const int jjup = jju + m;
|
||||
|
||||
// if m is on the "diagonal", initialize it with the self energy.
|
||||
// Otherwise zero it out
|
||||
real_type init = 0;
|
||||
if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = wself; } //need to map iatom to element
|
||||
|
||||
ulisttot_re(iatom, jelem, jjup) = init;
|
||||
ulisttot_im(iatom, jelem, jjup) = 0;
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Ui by computing Wigner U-functions for one neighbor and
|
||||
accumulating to the total. GPU only.
|
||||
|
||||
Reference in New Issue
Block a user