Merge branch 'master' into test-updates

This commit is contained in:
Axel Kohlmeyer
2020-06-22 21:00:33 -04:00
7 changed files with 1219 additions and 543 deletions

View File

@ -30,19 +30,31 @@ PairStyle(snap/kk/host,PairSNAPKokkos<LMPHostType>)
namespace LAMMPS_NS { namespace LAMMPS_NS {
// Routines for both the CPU and GPU backend
template<int NEIGHFLAG, int EVFLAG> template<int NEIGHFLAG, int EVFLAG>
struct TagPairSNAPComputeForce{}; struct TagPairSNAPComputeForce{};
struct TagPairSNAPBeta{};
struct TagPairSNAPComputeNeigh{}; struct TagPairSNAPComputeNeigh{};
// GPU backend only
struct TagPairSNAPPreUi{}; struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUi{}; struct TagPairSNAPComputeUi{};
struct TagPairSNAPComputeUiCPU{}; struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeZi{};
struct TagPairSNAPBeta{};
struct TagPairSNAPComputeBi{}; struct TagPairSNAPComputeBi{};
struct TagPairSNAPZeroYi{}; struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYi{};
struct TagPairSNAPTransformYi{}; // re-order ylist from AoSoA to AoS
struct TagPairSNAPComputeFusedDeidrj{}; struct TagPairSNAPComputeFusedDeidrj{};
// CPU backend only
// Empty tag types for the host (CPU) kernels. Each one is used as the work
// tag of a Kokkos execution policy in compute() and selects the operator()
// overload of PairSNAPKokkos that takes the same tag as its first argument.
struct TagPairSNAPPreUiCPU{};     // team policy; overload calls pre_ui_cpu()
struct TagPairSNAPComputeUiCPU{}; // team policy; overload calls compute_ui_cpu()
struct TagPairSNAPComputeZiCPU{}; // range policy; overload calls compute_zi_cpu()
struct TagPairSNAPBetaCPU{};      // range policy over chunk_size; fills d_beta
struct TagPairSNAPComputeBiCPU{}; // team policy; overload calls compute_bi_cpu()
struct TagPairSNAPZeroYiCPU{};    // team policy; overload calls zero_yi_cpu()
struct TagPairSNAPComputeYiCPU{}; // range policy; overload calls compute_yi_cpu()
struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDuidrjCPU{};
struct TagPairSNAPComputeDeidrjCPU{}; struct TagPairSNAPComputeDeidrjCPU{};
@ -81,6 +93,10 @@ public:
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const; void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;
// Host kernel that fills d_beta for chunk-local atom index ii; launched from
// compute() through a RangePolicy over [0, chunk_size).
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPBetaCPU,const int& ii) const;
// GPU backend only
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const; void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;
@ -88,32 +104,54 @@ public:
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const; void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const; void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeZi,const int& ii) const; void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const; void operator() (TagPairSNAPBeta, const int& ii) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::member_type& team) const; void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYi,const int& ii) const; void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const; void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const;
// CPU backend only
// Host kernel, team policy: forwards one chunk-local atom to pre_ui_cpu().
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUiCPU>::member_type& team) const;
// Host kernel, team policy: decodes an (atom, neighbor) pair from the league
// rank and forwards it to compute_ui_cpu().
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
// Host kernel, range policy: ii is a flat index in [0, chunk_size*idxz_max)
// handed straight to compute_zi_cpu().
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;
// Host kernel, team policy: one league entry per atom, forwarded to
// compute_bi_cpu().
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;
// Host kernel, team policy: decodes an (idxu, atom) pair from the league rank
// and calls zero_yi_cpu() for each element.
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYiCPU>::member_type& team) const;
// Host kernel, range policy: ii is a flat index in [0, chunk_size*idxz_max)
// handed to compute_yi_cpu() together with d_beta.
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const; void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const; void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPBeta,const int& ii) const;
template<int NEIGHFLAG> template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
@ -175,6 +213,7 @@ inline double dist2(double* x,double* y);
Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements
Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list
Kokkos::View<F_FLOAT**, DeviceType> d_beta; // betas for all atoms in list Kokkos::View<F_FLOAT**, DeviceType> d_beta; // betas for all atoms in list
Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> d_beta_pack; // betas for all atoms in list, GPU
Kokkos::View<F_FLOAT**, DeviceType> d_bispectrum; // bispectrum components for all atoms in list Kokkos::View<F_FLOAT**, DeviceType> d_bispectrum; // bispectrum components for all atoms in list
typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams; typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;

View File

@ -194,6 +194,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
if (beta_max < inum) { if (beta_max < inum) {
beta_max = inum; beta_max = inum;
d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",ncoeff,inum); d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",ncoeff,inum);
if (!host_flag)
d_beta_pack = Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType>("PairSNAPKokkos:beta_pack",32,ncoeff,(inum+32-1)/32);
d_ninside = Kokkos::View<int*, DeviceType>("PairSNAPKokkos:ninside",inum); d_ninside = Kokkos::View<int*, DeviceType>("PairSNAPKokkos:ninside",inum);
} }
@ -222,32 +224,93 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
} }
//PreUi if (host_flag)
{ {
int vector_length = vector_length_default; // Host codepath
int team_size = team_size_default;
if (!host_flag)
vector_length = 32;
check_team_size_for<TagPairSNAPPreUi>(chunk_size,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
Kokkos::parallel_for("PreUi",policy_preui,*this);
}
// ComputeUI //PreUi
{ {
int vector_length = vector_length_default; int vector_length = vector_length_default;
int team_size = team_size_default; int team_size = team_size_default;
if (host_flag) { // CPU check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size,vector_length);
// Run a fused calculation of ulist and accumulation into ulisttot using atomics typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
}
// ComputeUi
{
int vector_length = vector_length_default;
int team_size = team_size_default;
// Fused calculation of ulist and accumulation into ulisttot using atomics
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
}
} else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics //Compute bispectrum
if (quadraticflag || eflag) {
//ComputeZi
int idxz_max = snaKK.idxz_max;
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZiCPU> policy_zi_cpu(0,chunk_size*idxz_max);
Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this);
vector_length = 32; //ComputeBi
team_size = 4; // need to cap b/c of shared memory reqs int vector_length = vector_length_default;
int team_size = team_size_default;
check_team_size_for<TagPairSNAPComputeBiCPU>(chunk_size,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(chunk_size,team_size,vector_length);
Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
}
//ZeroYi,ComputeYi
{
int vector_length = vector_length_default;
int team_size = team_size_default;
//Compute beta = dE_i/dB_i for all i in list
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);
//ZeroYi
check_team_size_for<TagPairSNAPZeroYiCPU>(chunk_size,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this);
//ComputeYi
int idxz_max = snaKK.idxz_max;
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this);
} // host flag
//ComputeDuidrj and Deidrj
{
int team_size = team_size_default;
int vector_length = vector_length_default;
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
snaKK.set_dir(-1); // technically doesn't do anything
Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
}
} else { // GPU
#ifdef KOKKOS_ENABLE_CUDA
//PreUi
{
int vector_length = vector_length_default;
int team_size = team_size_default;
check_team_size_for<TagPairSNAPPreUi>(chunk_size,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
Kokkos::parallel_for("PreUi",policy_preui,*this);
}
// ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot
{
int vector_length = 32;
int team_size = 4; // need to cap b/c of shared memory reqs
check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length); check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);
// scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
@ -265,62 +328,54 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeUi",policy_ui,*this); Kokkos::parallel_for("ComputeUi",policy_ui,*this);
//Transform data layout of ulisttot to AoSoA, zero ylist
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
} }
}
//Compute bispectrum in AoSoA data layout, transform Bi
if (quadraticflag || eflag) {
//ComputeZi
int idxz_max = snaKK.idxz_max;
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1});
Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
//Compute bispectrum //ComputeBi
if (quadraticflag || eflag) { int idxb_max = snaKK.idxb_max;
//ComputeZi typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1});
int idxz_max = snaKK.idxz_max; Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZi> policy_zi(0,chunk_size*idxz_max);
Kokkos::parallel_for("ComputeZi",policy_zi,*this);
//ComputeBi //Transform data layout of blist out of AoSoA
int vector_length = vector_length_default; //We need this b/c `blist` gets used in ComputeForce which doesn't
int team_size = team_size_default; //take advantage of AoSoA (which at best would only be beneficial
check_team_size_for<TagPairSNAPComputeBi>(chunk_size,team_size,vector_length); //on the margins)
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBi> policy_bi(chunk_size,team_size,vector_length); typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1});
Kokkos::parallel_for("ComputeBi",policy_bi,*this); Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
} }
//Compute beta = dE_i/dB_i for all i in list //ComputeYi in AoSoA data layout, transform to AoS for ComputeFusedDeidrj
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size); //Note zeroing `ylist` is fused into `TransformUi`.
Kokkos::parallel_for("ComputeBeta",policy_beta,*this); {
//Compute beta = dE_i/dB_i for all i in list
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
//ZeroYi //ComputeYi
{ const int idxz_max = snaKK.idxz_max;
int vector_length = vector_length_default; typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1});
int team_size = team_size_default; Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
if (!host_flag)
team_size = 128;
check_team_size_for<TagPairSNAPZeroYi>(chunk_size,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYi> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);
}
//ComputeYi //Transform data layout of ylist out of AoSoA
int idxz_max = snaKK.idxz_max; typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max); Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);
Kokkos::parallel_for("ComputeYi",policy_yi,*this);
//ComputeDuidrj and Deidrj }
{
int team_size = team_size_default;
int vector_length = vector_length_default;
if (host_flag) { // CPU
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); // Fused ComputeDuidrj, ComputeDeidrj
snaKK.set_dir(-1); // technically doesn't do anything {
Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); int vector_length = 32;
int team_size = 2; // need to cap b/c of shared memory reqs
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
} else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei
vector_length = 32;
team_size = 2; // need to cap b/c of shared memory reqs
check_team_size_for<TagPairSNAPComputeFusedDeidrj>(chunk_size,team_size,vector_length); check_team_size_for<TagPairSNAPComputeFusedDeidrj>(chunk_size,team_size,vector_length);
// scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry
@ -341,6 +396,9 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this); Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this);
} }
} }
#endif // KOKKOS_ENABLE_CUDA
} }
//ComputeForce //ComputeForce
@ -416,38 +474,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
} }
} }
/* ---------------------------------------------------------------------- */
// Fill d_beta(:,ii) (beta = dE_i/dB_i) for the atom at chunk-local index ii.
// The linear part copies the element's coefficients (entry 0 is skipped); the
// optional quadratic part adds terms built from the bispectrum values in
// blist, walking the remaining coefficients in upper-triangular order.
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const int& ii) const {
  // chunk-local index -> atom index -> element index
  const int atom = d_ilist[ii + chunk_offset];
  const int elem = d_map[type[atom]];
  SNAKokkos<DeviceType> sna = snaKK;

  // unmanaged 1-D view onto this element's row of d_coeffelem
  Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
    coeffs(d_coeffelem,elem,Kokkos::ALL);

  // linear contribution
  for (int a = 0; a < ncoeff; a++)
    d_beta(a,ii) = coeffs[a+1];

  if (!quadraticflag) return;

  // quadratic contribution; k walks the coefficients following the linear ones
  int k = ncoeff+1;
  for (int a = 0; a < ncoeff; a++) {
    const double b_a = sna.blist(a,ii);
    d_beta(a,ii) += coeffs[k]*b_a;
    k++;
    for (int b = a+1; b < ncoeff; b++, k++) {
      const double b_b = sna.blist(b,ii);
      d_beta(a,ii) += coeffs[k]*b_b;
      d_beta(b,ii) += coeffs[k]*b_a;
    }
  }
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
allocate all arrays allocate all arrays
@ -520,6 +546,10 @@ void PairSNAPKokkos<DeviceType>::coeff(int narg, char **arg)
snaKK.init(); snaKK.init();
} }
/* ----------------------------------------------------------------------
Begin routines that are called on both CPU and GPU codepaths
------------------------------------------------------------------------- */
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
@ -594,6 +624,53 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
}); });
} }
/* ----------------------------------------------------------------------
Begin routines that are unique to the GPU codepath. These take advantage
of AoSoA data layouts and scratch memory for recursive polynomials
------------------------------------------------------------------------- */
// Fill d_beta_pack (beta = dE_i/dB_i) for the atom at chunk-local index ii.
// The atom index is split into (ii % 32, ii / 32) to address the packed
// three-index layout of d_beta_pack. The linear part copies the element's
// coefficients (entry 0 is skipped); the optional quadratic part adds terms
// built from blist, walking the remaining coefficients in upper-triangular
// order.
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const int& ii) const {

  if (ii >= chunk_size) return;

  // position of this atom inside the packed layout
  const int lane = ii % 32;
  const int pack = ii / 32;

  // chunk-local index -> atom index -> element index
  const int atom = d_ilist[ii + chunk_offset];
  const int elem = d_map[type[atom]];
  SNAKokkos<DeviceType> sna = snaKK;

  // unmanaged 1-D view onto this element's row of d_coeffelem
  Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
    coeffs(d_coeffelem,elem,Kokkos::ALL);

  // linear contribution
  for (int a = 0; a < ncoeff; a++)
    d_beta_pack(lane,a,pack) = coeffs[a+1];

  if (!quadraticflag) return;

  // quadratic contribution; a flat coefficient index c is split into
  // (c % idxb_max, c / idxb_max) to address blist
  const auto idxb_max = sna.idxb_max;
  int k = ncoeff+1;
  for (int a = 0; a < ncoeff; a++) {
    const auto a_idxb = a % idxb_max;
    const auto a_chem = a / idxb_max;
    const double b_a = sna.blist(a_idxb, a_chem, ii);
    d_beta_pack(lane,a,pack) += coeffs[k]*b_a;
    k++;
    for (int b = a+1; b < ncoeff; b++, k++) {
      const auto b_idxb = b % idxb_max;
      const auto b_chem = b / idxb_max;
      const double b_b = sna.blist(b_idxb, b_chem, ii);
      d_beta_pack(lane,a,pack) += coeffs[k]*b_b;
      d_beta_pack(lane,b,pack) += coeffs[k]*b_a;
    }
  }
}
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi>::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi>::member_type& team) const {
@ -627,61 +704,107 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
SNAKokkos<DeviceType> my_sna = snaKK; SNAKokkos<DeviceType> my_sna = snaKK;
// Extract the atom number const int iatom = iatom_mod + iatom_div * 32;
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); if (iatom >= chunk_size) return;
if (ii >= chunk_size) return;
// Extract the neighbor number if (idxu >= my_sna.idxu_max) return;
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
const int ninside = d_ninside(ii); int elem_count = chemflag ? nelements : 1;
if (jj >= ninside) return;
for (int ielem = 0; ielem < elem_count; ielem++) {
const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom);
const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom);
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.;
my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.;
}
my_sna.compute_ui_cpu(team,ii,jj);
} }
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYi>::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
SNAKokkos<DeviceType> my_sna = snaKK; SNAKokkos<DeviceType> my_sna = snaKK;
// Extract the quantum number const int iatom = iatom_mod + iatom_div * 32;
const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); if (iatom >= chunk_size) return;
if (idx >= my_sna.idxu_max) return;
// Extract the atomic index if (jjz >= my_sna.idxz_max) return;
const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
if (ii >= chunk_size) return;
if (chemflag) my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
for(int ielem = 0; ielem < nelements; ielem++)
my_sna.zero_yi(idx,ii,ielem);
else
my_sna.zero_yi(idx,ii,0);
} }
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int& ii) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const {
SNAKokkos<DeviceType> my_sna = snaKK; SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.compute_yi(ii,d_beta);
const int iatom = iatom_mod + iatom_div * 32;
if (iatom >= chunk_size) return;
if (idxu >= my_sna.idxu_max) return;
int elem_count = chemflag ? nelements : 1;
for (int ielem = 0; ielem < elem_count; ielem++) {
const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div);
const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div);
my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im };
}
} }
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const int& ii) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
SNAKokkos<DeviceType> my_sna = snaKK; SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.compute_zi(ii);
const int iatom = iatom_mod + iatom_div * 32;
if (iatom >= chunk_size) return;
if (jjz >= my_sna.idxz_max) return;
my_sna.compute_zi(iatom_mod,jjz,iatom_div);
} }
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBi>::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
int ii = team.league_rank();
SNAKokkos<DeviceType> my_sna = snaKK; SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.compute_bi(team,ii);
const int iatom = iatom_mod + iatom_div * 32;
if (iatom >= chunk_size) return;
if (jjb >= my_sna.idxb_max) return;
my_sna.compute_bi(iatom_mod,jjb,iatom_div);
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
SNAKokkos<DeviceType> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * 32;
if (iatom >= chunk_size) return;
if (idxb >= my_sna.idxb_max) return;
const int ntriples = my_sna.ntriples;
for (int itriple = 0; itriple < ntriples; itriple++) {
const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
my_sna.blist(idxb, itriple, iatom) = blocal;
}
} }
template<class DeviceType> template<class DeviceType>
@ -701,6 +824,126 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeFusedDeidrj,const
my_sna.compute_fused_deidrj(team,ii,jj); my_sna.compute_fused_deidrj(team,ii,jj);
} }
/* ----------------------------------------------------------------------
Begin routines that are unique to the CPU codepath. These do not take
advantage of AoSoA data layouts, but that could be a good point of
future optimization and unification with the above kernels. It's unlikely
that scratch memory optimizations will ever be useful for the CPU due to
different arithmetic intensity requirements for the CPU vs GPU.
------------------------------------------------------------------------- */
// Host version: fill d_beta(:,ii) (beta = dE_i/dB_i) for the atom at
// chunk-local index ii. The linear part copies the element's coefficients
// (entry 0 is skipped); the optional quadratic part adds terms built from
// blist, walking the remaining coefficients in upper-triangular order.
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBetaCPU,const int& ii) const {

  // chunk-local index -> atom index -> element index
  const int atom = d_ilist[ii + chunk_offset];
  const int elem = d_map[type[atom]];
  SNAKokkos<DeviceType> sna = snaKK;

  // unmanaged 1-D view onto this element's row of d_coeffelem
  Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
    coeffs(d_coeffelem,elem,Kokkos::ALL);

  // linear contribution
  for (int a = 0; a < ncoeff; a++)
    d_beta(a,ii) = coeffs[a+1];

  if (!quadraticflag) return;

  // quadratic contribution; a flat coefficient index c is split into
  // (c % idxb_max, c / idxb_max) to address blist
  const auto idxb_max = sna.idxb_max;
  int k = ncoeff+1;
  for (int a = 0; a < ncoeff; a++) {
    const auto a_idxb = a % idxb_max;
    const auto a_chem = a / idxb_max;
    const double b_a = sna.blist(a_idxb,a_chem,ii);
    d_beta(a,ii) += coeffs[k]*b_a;
    k++;
    for (int b = a+1; b < ncoeff; b++, k++) {
      const auto b_idxb = b % idxb_max;
      const auto b_chem = b / idxb_max;
      const double b_b = sna.blist(b_idxb,b_chem,ii);
      d_beta(a,ii) += coeffs[k]*b_b;
      d_beta(b,ii) += coeffs[k]*b_a;
    }
  }
}
// Host kernel launched with a TeamPolicy: each (league_rank, team_rank) pair
// identifies one chunk-local atom, whose element index is looked up and
// handed to SNAKokkos::pre_ui_cpu().
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU>::member_type& team) const {
  SNAKokkos<DeviceType> my_sna = snaKK;

  // Extract the atom number (index within the current chunk)
  const int ii = team.team_rank() + team.team_size() * team.league_rank();
  if (ii >= chunk_size) return;

  // ii is chunk-local, so map it through the neighbor-list ordering and the
  // chunk offset before reading the atom type, as the Beta kernels do.
  // Indexing type with ii directly picks the wrong atom whenever
  // chunk_offset != 0 or d_ilist is not the identity.
  const int i = d_ilist[ii + chunk_offset];
  const int itype = type[i];
  const int ielem = d_map[itype];

  // pre_ui_cpu still receives the chunk-local index ii
  my_sna.pre_ui_cpu(team,ii,ielem);
}
// Host kernel launched with a TeamPolicy whose league covers
// ceil(chunk_size/team_size) * max_neighs entries. The league rank is decoded
// into a chunk-local atom index and a neighbor index, which are forwarded to
// SNAKokkos::compute_ui_cpu().
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
  SNAKokkos<DeviceType> sna = snaKK;

  const int nthreads = team.team_size();
  const int league = team.league_rank();
  // number of league entries needed to cover all atoms of the chunk once
  const int atom_blocks = (chunk_size+nthreads-1)/nthreads;

  // atom index within the chunk
  const int ii = team.team_rank() + nthreads * (league % atom_blocks);
  if (ii >= chunk_size) return;

  // neighbor index; skip slots beyond this atom's neighbor count
  const int jj = league / atom_blocks;
  if (jj >= d_ninside(ii)) return;

  sna.compute_ui_cpu(team,ii,jj);
}
// Host kernel launched with a TeamPolicy whose league covers
// ceil(idxu_max/team_size) * chunk_size entries. The league rank is decoded
// into an index below idxu_max and a chunk-local atom index; zero_yi_cpu() is
// then called for every element (a single element 0 when chemflag is off).
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU>::member_type& team) const {
  SNAKokkos<DeviceType> sna = snaKK;

  const int nthreads = team.team_size();
  const int league = team.league_rank();
  // number of league entries needed to cover idxu_max once
  const auto idx_blocks = (sna.idxu_max+nthreads-1)/nthreads;

  // index into the idxu range
  const int idx = team.team_rank() + nthreads * (league % idx_blocks);
  if (idx >= sna.idxu_max) return;

  // atom index within the chunk
  const int ii = league / idx_blocks;
  if (ii >= chunk_size) return;

  const int elem_count = chemflag ? nelements : 1;
  for (int ielem = 0; ielem < elem_count; ielem++)
    sna.zero_yi_cpu(idx,ii,ielem);
}
// Host kernel launched with a RangePolicy over [0, chunk_size*idxz_max):
// forwards the flat index and the d_beta view to SNAKokkos::compute_yi_cpu().
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYiCPU,const int& ii) const {
  // work on a local copy of the SNA object, as the other kernels do
  SNAKokkos<DeviceType> sna(snaKK);
  sna.compute_yi_cpu(ii,d_beta);
}
// Host kernel launched with a RangePolicy over [0, chunk_size*idxz_max):
// forwards the flat index to SNAKokkos::compute_zi_cpu().
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZiCPU,const int& ii) const {
  // work on a local copy of the SNA object, as the other kernels do
  SNAKokkos<DeviceType> sna(snaKK);
  sna.compute_zi_cpu(ii);
}
// Host kernel launched with a TeamPolicy of chunk_size league entries: the
// league rank is used directly as the chunk-local atom index and forwarded,
// with the team handle, to SNAKokkos::compute_bi_cpu().
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU>::member_type& team) const {
  // one league entry per atom of the chunk
  const int atom_in_chunk = team.league_rank();
  SNAKokkos<DeviceType> sna(snaKK);
  sna.compute_bi_cpu(team,atom_in_chunk);
}
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
@ -735,6 +978,12 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const t
my_sna.compute_deidrj_cpu(team,ii,jj); my_sna.compute_deidrj_cpu(team,ii,jj);
} }
/* ----------------------------------------------------------------------
Also used for both CPU and GPU codepaths. Could maybe benefit from a
separate GPU/CPU codepath, but this kernel takes so little time it's
likely not worth it.
------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG> template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
@ -799,20 +1048,31 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
// E = beta.B + 0.5*B^t.alpha.B // E = beta.B + 0.5*B^t.alpha.B
const auto idxb_max = snaKK.idxb_max;
// linear contributions // linear contributions
for (int icoeff = 0; icoeff < ncoeff; icoeff++) for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
evdwl += d_coeffi[icoeff+1]*my_sna.blist(icoeff,ii); const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
}
// quadratic contributions // quadratic contributions
if (quadraticflag) { if (quadraticflag) {
int k = ncoeff+1; int k = ncoeff+1;
for (int icoeff = 0; icoeff < ncoeff; icoeff++) { for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
double bveci = my_sna.blist(icoeff,ii); const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
double bveci = my_sna.blist(idxb,idx_chem,ii);
evdwl += 0.5*d_coeffi[k++]*bveci*bveci; evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
double bvecj = my_sna.blist(jcoeff,ii); auto jdxb = jcoeff % idxb_max;
auto jdx_chem = jcoeff / idxb_max;
double bvecj = my_sna.blist(jdxb,jdx_chem,ii);
evdwl += d_coeffi[k++]*bveci*bvecj; evdwl += d_coeffi[k++]*bveci*bvecj;
} }
} }

View File

@ -36,7 +36,9 @@ public:
typedef Kokkos::View<double**, DeviceType> t_sna_2d; typedef Kokkos::View<double**, DeviceType> t_sna_2d;
typedef Kokkos::View<double**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll; typedef Kokkos::View<double**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll;
typedef Kokkos::View<double***, DeviceType> t_sna_3d; typedef Kokkos::View<double***, DeviceType> t_sna_3d;
typedef Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> t_sna_3d_ll;
typedef Kokkos::View<double***[3], DeviceType> t_sna_4d; typedef Kokkos::View<double***[3], DeviceType> t_sna_4d;
typedef Kokkos::View<double****, Kokkos::LayoutLeft, DeviceType> t_sna_4d_ll;
typedef Kokkos::View<double**[3], DeviceType> t_sna_3d3; typedef Kokkos::View<double**[3], DeviceType> t_sna_3d3;
typedef Kokkos::View<double*****, DeviceType> t_sna_5d; typedef Kokkos::View<double*****, DeviceType> t_sna_5d;
@ -48,7 +50,8 @@ public:
typedef Kokkos::View<SNAcomplex***, DeviceType> t_sna_3c; typedef Kokkos::View<SNAcomplex***, DeviceType> t_sna_3c;
typedef Kokkos::View<SNAcomplex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll; typedef Kokkos::View<SNAcomplex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll;
typedef Kokkos::View<SNAcomplex***[3], DeviceType> t_sna_4c; typedef Kokkos::View<SNAcomplex***[3], DeviceType> t_sna_4c;
typedef Kokkos::View<SNAcomplex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll; typedef Kokkos::View<SNAcomplex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c3_ll;
typedef Kokkos::View<SNAcomplex****, Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3; typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c; typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
@ -73,27 +76,39 @@ inline
int ncoeff; int ncoeff;
// functions for bispectrum coefficients // functions for bispectrum coefficients, GPU only
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&, int); // ForceSNAP void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_zi(const int&, const int&, const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_yi(int,int,int,
const Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_bi(const int&, const int&, const int&); // ForceSNAP
// functions for bispectrum coefficients, CPU only
KOKKOS_INLINE_FUNCTION
void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_zi(const int&); // ForceSNAP void compute_zi_cpu(const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void zero_yi(const int&, const int&, int); // ForceSNAP void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_yi(int, void compute_yi_cpu(int,
const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
// functions for derivatives
// functions for derivatives, GPU only
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); //ForceSNAP void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); //ForceSNAP
// functions for derivatives, CPU only
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
@ -139,18 +154,32 @@ inline
int twojmax, diagonalstyle; int twojmax, diagonalstyle;
t_sna_2d_ll blist; t_sna_3d_ll blist;
t_sna_2c_ll ulisttot; t_sna_3c_ll ulisttot;
t_sna_2c_ll zlist; t_sna_3c_ll zlist;
t_sna_3c_ll ulist; t_sna_3c_ll ulist;
t_sna_2c_ll ylist; t_sna_3c_ll ylist;
// derivatives of data // derivatives of data
t_sna_4c_ll dulist; t_sna_4c3_ll dulist;
// Modified structures for GPU backend
t_sna_3d_ll ulisttot_re; // split real,
t_sna_3d_ll ulisttot_im; // imag
t_sna_4c_ll ulisttot_pack; // AoSoA layout
t_sna_4c_ll zlist_pack; // AoSoA layout
t_sna_4d_ll blist_pack;
t_sna_4d_ll ylist_pack_re; // split real,
t_sna_4d_ll ylist_pack_im; // imag AoSoA layout
int idxcg_max, idxu_max, idxz_max, idxb_max; int idxcg_max, idxu_max, idxz_max, idxb_max;
// Chem snap counts
int nelements;
int ndoubles;
int ntriples;
private: private:
double rmin0, rfac0; double rmin0, rfac0;
@ -212,9 +241,6 @@ inline
// Chem snap flags // Chem snap flags
int chem_flag; int chem_flag;
int bnorm_flag; int bnorm_flag;
int nelements;
int ndoubles;
int ntriples;
// Self-weight // Self-weight
double wself; double wself;

File diff suppressed because it is too large Load Diff

View File

@ -362,6 +362,7 @@ void MSM::setup()
nmax_direct = 8*(nxhi_direct+1)*(nyhi_direct+1)*(nzhi_direct+1); nmax_direct = 8*(nxhi_direct+1)*(nyhi_direct+1)*(nzhi_direct+1);
deallocate();
if (peratom_allocate_flag) deallocate_peratom(); if (peratom_allocate_flag) deallocate_peratom();
// compute direct sum interaction weights // compute direct sum interaction weights
@ -612,8 +613,6 @@ void MSM::compute(int eflag, int vflag)
void MSM::allocate() void MSM::allocate()
{ {
deallocate();
// interpolation coeffs // interpolation coeffs
order_allocated = order; order_allocated = order;
@ -635,9 +634,9 @@ void MSM::allocate()
// allocate memory for each grid level // allocate memory for each grid level
for (int n=0; n<levels; n++) { for (int n=0; n<levels; n++) {
memory->create3d_offset(qgrid[n],nzlo_out[n],nzhi_out[n], memory->create3d_offset(qgrid[n],nzlo_out[n],nzhi_out[n],
nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:qgrid"); nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:qgrid");
memory->create3d_offset(egrid[n],nzlo_out[n],nzhi_out[n], memory->create3d_offset(egrid[n],nzlo_out[n],nzhi_out[n],
nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:egrid"); nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:egrid");
@ -660,23 +659,29 @@ void MSM::allocate()
void MSM::deallocate() void MSM::deallocate()
{ {
delete cg_all;
cg_all = nullptr;
memory->destroy2d_offset(phi1d,-order_allocated); memory->destroy2d_offset(phi1d,-order_allocated);
memory->destroy2d_offset(dphi1d,-order_allocated); memory->destroy2d_offset(dphi1d,-order_allocated);
if (cg_all) delete cg_all;
cg_all = nullptr;
for (int n=0; n<levels; n++) { for (int n=0; n<levels; n++) {
memory->destroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); if (qgrid[n])
memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]); memory->destroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
if (world_levels[n] != MPI_COMM_NULL) if (egrid[n])
MPI_Comm_free(&world_levels[n]); memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
world_levels[n] = MPI_COMM_NULL;
active_flag[n] = 0;
delete cg[n]; if (world_levels)
cg[n] = nullptr; if (world_levels[n] != MPI_COMM_NULL)
MPI_Comm_free(&world_levels[n]);
if (cg) {
if (cg[n]) {
delete cg[n];
cg[n] = nullptr;
}
}
} }
} }
@ -765,7 +770,6 @@ void MSM::deallocate_peratom()
void MSM::allocate_levels() void MSM::allocate_levels()
{ {
deallocate_levels();
ngrid = new int[levels]; ngrid = new int[levels];
cg = new GridComm*[levels]; cg = new GridComm*[levels];
@ -815,21 +819,21 @@ void MSM::allocate_levels()
v5grid = new double***[levels]; v5grid = new double***[levels];
for (int n=0; n<levels; n++) { for (int n=0; n<levels; n++) {
cg[n] = nullptr; cg[n] = NULL;
world_levels[n] = MPI_COMM_NULL; world_levels[n] = MPI_COMM_NULL;
active_flag[n] = 0; cg_peratom[n] = NULL;
cg_peratom[n] = nullptr;
qgrid[n] = nullptr; qgrid[n] = NULL;
egrid[n] = nullptr; egrid[n] = NULL;
v0grid[n] = nullptr; v0grid[n] = NULL;
v1grid[n] = nullptr; v1grid[n] = NULL;
v2grid[n] = nullptr; v2grid[n] = NULL;
v3grid[n] = nullptr; v3grid[n] = NULL;
v4grid[n] = nullptr; v4grid[n] = NULL;
v5grid[n] = nullptr; v5grid[n] = NULL;
} }
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@ -1105,6 +1109,7 @@ void MSM::set_grid_global()
if (!domain->nonperiodic) levels -= 1; if (!domain->nonperiodic) levels -= 1;
deallocate_levels();
allocate_levels(); allocate_levels();
// find number of grid levels in each direction // find number of grid levels in each direction

View File

@ -33,6 +33,8 @@ using namespace MathConst;
#define DELTA 16384 #define DELTA 16384
#define DELTA_BONUS 8192 #define DELTA_BONUS 8192
int AtomVec::num_atom_vecs = 0;
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp)
@ -54,6 +56,8 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp)
threads = NULL; threads = NULL;
++num_atom_vecs;
// peratom variables auto-included in corresponding child style fields string // peratom variables auto-included in corresponding child style fields string
// these fields cannot be specified in the fields string // these fields cannot be specified in the fields string
@ -93,44 +97,48 @@ AtomVec::~AtomVec()
int datatype,cols; int datatype,cols;
void *pdata; void *pdata;
--num_atom_vecs;
for (int i = 0; i < nargcopy; i++) delete [] argcopy[i]; for (int i = 0; i < nargcopy; i++) delete [] argcopy[i];
delete [] argcopy; delete [] argcopy;
memory->destroy(atom->tag); if (num_atom_vecs == 0) {
memory->destroy(atom->type); memory->destroy(atom->tag);
memory->destroy(atom->mask); memory->destroy(atom->type);
memory->destroy(atom->image); memory->destroy(atom->mask);
memory->destroy(atom->x); memory->destroy(atom->image);
memory->destroy(atom->v); memory->destroy(atom->x);
memory->destroy(atom->f); memory->destroy(atom->v);
memory->destroy(atom->f);
for (int i = 0; i < ngrow; i++) { for (int i = 0; i < ngrow; i++) {
pdata = mgrow.pdata[i]; pdata = mgrow.pdata[i];
datatype = mgrow.datatype[i]; datatype = mgrow.datatype[i];
cols = mgrow.cols[i]; cols = mgrow.cols[i];
if (datatype == Atom::DOUBLE) { if (datatype == Atom::DOUBLE) {
if (cols == 0) if (cols == 0)
memory->destroy(*((double **) pdata)); memory->destroy(*((double **) pdata));
else if (cols > 0) else if (cols > 0)
memory->destroy(*((double ***) pdata)); memory->destroy(*((double ***) pdata));
else { else {
memory->destroy(*((double ***) pdata)); memory->destroy(*((double ***) pdata));
} }
} else if (datatype == Atom::INT) { } else if (datatype == Atom::INT) {
if (cols == 0) if (cols == 0)
memory->destroy(*((int **) pdata)); memory->destroy(*((int **) pdata));
else if (cols > 0) else if (cols > 0)
memory->destroy(*((int ***) pdata)); memory->destroy(*((int ***) pdata));
else { else {
memory->destroy(*((int ***) pdata)); memory->destroy(*((int ***) pdata));
} }
} else if (datatype == Atom::BIGINT) { } else if (datatype == Atom::BIGINT) {
if (cols == 0) if (cols == 0)
memory->destroy(*((bigint **) pdata)); memory->destroy(*((bigint **) pdata));
else if (cols > 0) else if (cols > 0)
memory->destroy(*((bigint ***) pdata)); memory->destroy(*((bigint ***) pdata));
else { else {
memory->destroy(*((bigint ***) pdata)); memory->destroy(*((bigint ***) pdata));
}
} }
} }
} }

View File

@ -209,6 +209,10 @@ class AtomVec : protected Pointers {
bool *threads; bool *threads;
// counter for atom vec instances
static int num_atom_vecs;
// local methods // local methods
void grow_nmax(); void grow_nmax();