Merge branch 'master' into test-updates
This commit is contained in:
@ -30,19 +30,31 @@ PairStyle(snap/kk/host,PairSNAPKokkos<LMPHostType>)
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
// Routines for both the CPU and GPU backend
|
||||
template<int NEIGHFLAG, int EVFLAG>
|
||||
struct TagPairSNAPComputeForce{};
|
||||
|
||||
struct TagPairSNAPBeta{};
|
||||
struct TagPairSNAPComputeNeigh{};
|
||||
|
||||
// GPU backend only
|
||||
struct TagPairSNAPPreUi{};
|
||||
struct TagPairSNAPComputeUi{};
|
||||
struct TagPairSNAPComputeUiCPU{};
|
||||
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
|
||||
struct TagPairSNAPComputeZi{};
|
||||
struct TagPairSNAPBeta{};
|
||||
struct TagPairSNAPComputeBi{};
|
||||
struct TagPairSNAPZeroYi{};
|
||||
struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
|
||||
struct TagPairSNAPComputeYi{};
|
||||
struct TagPairSNAPTransformYi{}; // re-order ylist from AoSoA to AoS
|
||||
struct TagPairSNAPComputeFusedDeidrj{};
|
||||
|
||||
// CPU backend only
|
||||
struct TagPairSNAPPreUiCPU{};
|
||||
struct TagPairSNAPComputeUiCPU{};
|
||||
struct TagPairSNAPComputeZiCPU{};
|
||||
struct TagPairSNAPBetaCPU{};
|
||||
struct TagPairSNAPComputeBiCPU{};
|
||||
struct TagPairSNAPZeroYiCPU{};
|
||||
struct TagPairSNAPComputeYiCPU{};
|
||||
struct TagPairSNAPComputeDuidrjCPU{};
|
||||
struct TagPairSNAPComputeDeidrjCPU{};
|
||||
|
||||
@ -81,6 +93,10 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPBetaCPU,const int& ii) const;
|
||||
|
||||
// GPU backend only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;
|
||||
|
||||
@ -88,32 +104,54 @@ public:
|
||||
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
|
||||
void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeZi,const int& ii) const;
|
||||
void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const;
|
||||
void operator() (TagPairSNAPBeta, const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::member_type& team) const;
|
||||
void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYi,const int& ii) const;
|
||||
void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const;
|
||||
|
||||
// CPU backend only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUiCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeZiCPU,const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBiCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYiCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPBeta,const int& ii) const;
|
||||
|
||||
template<int NEIGHFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
|
||||
@ -175,6 +213,7 @@ inline double dist2(double* x,double* y);
|
||||
Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements
|
||||
Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list
|
||||
Kokkos::View<F_FLOAT**, DeviceType> d_beta; // betas for all atoms in list
|
||||
Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> d_beta_pack; // betas for all atoms in list, GPU
|
||||
Kokkos::View<F_FLOAT**, DeviceType> d_bispectrum; // bispectrum components for all atoms in list
|
||||
|
||||
typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
|
||||
|
||||
@ -194,6 +194,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
if (beta_max < inum) {
|
||||
beta_max = inum;
|
||||
d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",ncoeff,inum);
|
||||
if (!host_flag)
|
||||
d_beta_pack = Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType>("PairSNAPKokkos:beta_pack",32,ncoeff,(inum+32-1)/32);
|
||||
d_ninside = Kokkos::View<int*, DeviceType>("PairSNAPKokkos:ninside",inum);
|
||||
}
|
||||
|
||||
@ -222,32 +224,93 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
|
||||
}
|
||||
|
||||
//PreUi
|
||||
if (host_flag)
|
||||
{
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
if (!host_flag)
|
||||
vector_length = 32;
|
||||
check_team_size_for<TagPairSNAPPreUi>(chunk_size,team_size,vector_length);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("PreUi",policy_preui,*this);
|
||||
}
|
||||
// Host codepath
|
||||
|
||||
// ComputeUI
|
||||
{
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
if (host_flag) { // CPU
|
||||
// Run a fused calculation of ulist and accumulation into ulisttot using atomics
|
||||
//PreUi
|
||||
{
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size,vector_length);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
|
||||
}
|
||||
|
||||
// ComputeUi
|
||||
{
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
// Fused calculation of ulist and accumulation into ulisttot using atomics
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
|
||||
Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
|
||||
}
|
||||
|
||||
} else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics
|
||||
//Compute bispectrum
|
||||
if (quadraticflag || eflag) {
|
||||
//ComputeZi
|
||||
int idxz_max = snaKK.idxz_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZiCPU> policy_zi_cpu(0,chunk_size*idxz_max);
|
||||
Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this);
|
||||
|
||||
vector_length = 32;
|
||||
team_size = 4; // need to cap b/c of shared memory reqs
|
||||
//ComputeBi
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
check_team_size_for<TagPairSNAPComputeBiCPU>(chunk_size,team_size,vector_length);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
|
||||
}
|
||||
|
||||
//ZeroYi,ComputeYi
|
||||
{
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBetaCPU> policy_beta(0,chunk_size);
|
||||
Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this);
|
||||
|
||||
//ZeroYi
|
||||
check_team_size_for<TagPairSNAPZeroYiCPU>(chunk_size,team_size,vector_length);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("ZeroYiCPU",policy_zero_yi,*this);
|
||||
|
||||
//ComputeYi
|
||||
int idxz_max = snaKK.idxz_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYiCPU> policy_yi_cpu(0,chunk_size*idxz_max);
|
||||
Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this);
|
||||
} // host flag
|
||||
|
||||
//ComputeDuidrj and Deidrj
|
||||
{
|
||||
int team_size = team_size_default;
|
||||
int vector_length = vector_length_default;
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
snaKK.set_dir(-1); // technically doesn't do anything
|
||||
Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
|
||||
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
|
||||
}
|
||||
} else { // GPU
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
//PreUi
|
||||
{
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
check_team_size_for<TagPairSNAPPreUi>(chunk_size,team_size,vector_length);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("PreUi",policy_preui,*this);
|
||||
}
|
||||
|
||||
// ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot
|
||||
{
|
||||
|
||||
int vector_length = 32;
|
||||
int team_size = 4; // need to cap b/c of shared memory reqs
|
||||
check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);
|
||||
|
||||
// scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
|
||||
@ -265,62 +328,54 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
|
||||
Kokkos::parallel_for("ComputeUi",policy_ui,*this);
|
||||
|
||||
//Transform data layout of ulisttot to AoSoA, zero ylist
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//Compute bispectrum in AoSoA data layout, transform Bi
|
||||
if (quadraticflag || eflag) {
|
||||
//ComputeZi
|
||||
int idxz_max = snaKK.idxz_max;
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
|
||||
|
||||
//Compute bispectrum
|
||||
if (quadraticflag || eflag) {
|
||||
//ComputeZi
|
||||
int idxz_max = snaKK.idxz_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeZi> policy_zi(0,chunk_size*idxz_max);
|
||||
Kokkos::parallel_for("ComputeZi",policy_zi,*this);
|
||||
//ComputeBi
|
||||
int idxb_max = snaKK.idxb_max;
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
|
||||
|
||||
//ComputeBi
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
check_team_size_for<TagPairSNAPComputeBi>(chunk_size,team_size,vector_length);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBi> policy_bi(chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("ComputeBi",policy_bi,*this);
|
||||
}
|
||||
//Transform data layout of blist out of AoSoA
|
||||
//We need this b/c `blist` gets used in ComputeForce which doesn't
|
||||
//take advantage of AoSoA (which at best would only be beneficial
|
||||
//on the margins)
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
|
||||
}
|
||||
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
|
||||
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
|
||||
//ComputeYi in AoSoA data layout, transform to AoS for ComputeFusedDeidrj
|
||||
//Note zeroing `ylist` is fused into `TransformUi`.
|
||||
{
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPBeta> policy_beta(0,chunk_size);
|
||||
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
|
||||
|
||||
//ZeroYi
|
||||
{
|
||||
int vector_length = vector_length_default;
|
||||
int team_size = team_size_default;
|
||||
if (!host_flag)
|
||||
team_size = 128;
|
||||
check_team_size_for<TagPairSNAPZeroYi>(chunk_size,team_size,vector_length);
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYi> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);
|
||||
}
|
||||
//ComputeYi
|
||||
const int idxz_max = snaKK.idxz_max;
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
|
||||
|
||||
//ComputeYi
|
||||
int idxz_max = snaKK.idxz_max;
|
||||
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max);
|
||||
Kokkos::parallel_for("ComputeYi",policy_yi,*this);
|
||||
//Transform data layout of ylist out of AoSoA
|
||||
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_max,(chunk_size + 32 - 1) / 32},{32,4,1});
|
||||
Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);
|
||||
|
||||
//ComputeDuidrj and Deidrj
|
||||
{
|
||||
int team_size = team_size_default;
|
||||
int vector_length = vector_length_default;
|
||||
if (host_flag) { // CPU
|
||||
}
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
snaKK.set_dir(-1); // technically doesn't do anything
|
||||
Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
|
||||
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
|
||||
} else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei
|
||||
|
||||
vector_length = 32;
|
||||
team_size = 2; // need to cap b/c of shared memory reqs
|
||||
// Fused ComputeDuidrj, ComputeDeidrj
|
||||
{
|
||||
int vector_length = 32;
|
||||
int team_size = 2; // need to cap b/c of shared memory reqs
|
||||
check_team_size_for<TagPairSNAPComputeFusedDeidrj>(chunk_size,team_size,vector_length);
|
||||
|
||||
// scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry
|
||||
@ -341,6 +396,9 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
|
||||
}
|
||||
|
||||
//ComputeForce
|
||||
@ -416,38 +474,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const int& ii) const {
|
||||
|
||||
const int i = d_ilist[ii + chunk_offset];
|
||||
const int itype = type[i];
|
||||
const int ielem = d_map[itype];
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
||||
d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
|
||||
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++)
|
||||
d_beta(icoeff,ii) = d_coeffi[icoeff+1];
|
||||
|
||||
if (quadraticflag) {
|
||||
int k = ncoeff+1;
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
double bveci = my_sna.blist(icoeff,ii);
|
||||
d_beta(icoeff,ii) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
double bvecj = my_sna.blist(jcoeff,ii);
|
||||
d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
|
||||
d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
allocate all arrays
|
||||
@ -520,6 +546,10 @@ void PairSNAPKokkos<DeviceType>::coeff(int narg, char **arg)
|
||||
snaKK.init();
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Begin routines that are called on both CPU and GPU codepaths
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
@ -594,6 +624,53 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
|
||||
});
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Begin routines that are unique to the GPU codepath. These take advantage
|
||||
of AoSoA data layouts and scratch memory for recursive polynomials
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const int& ii) const {
|
||||
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int iatom_mod = ii % 32;
|
||||
const int iatom_div = ii / 32;
|
||||
|
||||
const int i = d_ilist[ii + chunk_offset];
|
||||
const int itype = type[i];
|
||||
const int ielem = d_map[itype];
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
||||
d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
|
||||
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1];
|
||||
}
|
||||
|
||||
if (quadraticflag) {
|
||||
const auto idxb_max = my_sna.idxb_max;
|
||||
int k = ncoeff+1;
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
double bveci = my_sna.blist(idxb, idx_chem, ii);
|
||||
d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
const auto jdxb = jcoeff % idxb_max;
|
||||
const auto jdx_chem = jcoeff / idxb_max;
|
||||
double bvecj = my_sna.blist(jdxb, jdx_chem, ii);
|
||||
d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj;
|
||||
d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi>::member_type& team) const {
|
||||
@ -627,61 +704,107 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
const int iatom = iatom_mod + iatom_div * 32;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
if (idxu >= my_sna.idxu_max) return;
|
||||
|
||||
int elem_count = chemflag ? nelements : 1;
|
||||
|
||||
for (int ielem = 0; ielem < elem_count; ielem++) {
|
||||
|
||||
const auto utot_re = my_sna.ulisttot_re(idxu, ielem, iatom);
|
||||
const auto utot_im = my_sna.ulisttot_im(idxu, ielem, iatom);
|
||||
|
||||
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
|
||||
|
||||
my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div) = 0.;
|
||||
my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div) = 0.;
|
||||
}
|
||||
|
||||
my_sna.compute_ui_cpu(team,ii,jj);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYi>::member_type& team) const {
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the quantum number
|
||||
const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
|
||||
if (idx >= my_sna.idxu_max) return;
|
||||
const int iatom = iatom_mod + iatom_div * 32;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
// Extract the atomic index
|
||||
const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
|
||||
if (ii >= chunk_size) return;
|
||||
if (jjz >= my_sna.idxz_max) return;
|
||||
|
||||
if (chemflag)
|
||||
for(int ielem = 0; ielem < nelements; ielem++)
|
||||
my_sna.zero_yi(idx,ii,ielem);
|
||||
else
|
||||
my_sna.zero_yi(idx,ii,0);
|
||||
my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int& ii) const {
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
my_sna.compute_yi(ii,d_beta);
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * 32;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (idxu >= my_sna.idxu_max) return;
|
||||
|
||||
int elem_count = chemflag ? nelements : 1;
|
||||
for (int ielem = 0; ielem < elem_count; ielem++) {
|
||||
const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu, ielem, iatom_div);
|
||||
const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu, ielem, iatom_div);
|
||||
|
||||
my_sna.ylist(idxu, ielem, iatom) = { y_re, y_im };
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const int& ii) const {
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
my_sna.compute_zi(ii);
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * 32;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (jjz >= my_sna.idxz_max) return;
|
||||
|
||||
my_sna.compute_zi(iatom_mod,jjz,iatom_div);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBi>::member_type& team) const {
|
||||
int ii = team.league_rank();
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
my_sna.compute_bi(team,ii);
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * 32;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (jjb >= my_sna.idxb_max) return;
|
||||
|
||||
my_sna.compute_bi(iatom_mod,jjb,iatom_div);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * 32;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (idxb >= my_sna.idxb_max) return;
|
||||
|
||||
const int ntriples = my_sna.ntriples;
|
||||
|
||||
for (int itriple = 0; itriple < ntriples; itriple++) {
|
||||
|
||||
const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
|
||||
|
||||
my_sna.blist(idxb, itriple, iatom) = blocal;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
@ -701,6 +824,126 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeFusedDeidrj,const
|
||||
my_sna.compute_fused_deidrj(team,ii,jj);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Begin routines that are unique to the CPU codepath. These do not take
|
||||
advantage of AoSoA data layouts, but that could be a good point of
|
||||
future optimization and unification with the above kernels. It's unlikely
|
||||
that scratch memory optimizations will ever be useful for the CPU due to
|
||||
different arithmetic intensity requirements for the CPU vs GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBetaCPU,const int& ii) const {
|
||||
|
||||
const int i = d_ilist[ii + chunk_offset];
|
||||
const int itype = type[i];
|
||||
const int ielem = d_map[itype];
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
||||
d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
|
||||
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++)
|
||||
d_beta(icoeff,ii) = d_coeffi[icoeff+1];
|
||||
|
||||
if (quadraticflag) {
|
||||
const auto idxb_max = my_sna.idxb_max;
|
||||
int k = ncoeff+1;
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
double bveci = my_sna.blist(idxb,idx_chem,ii);
|
||||
d_beta(icoeff,ii) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
const auto jdxb = jcoeff % idxb_max;
|
||||
const auto jdx_chem = jcoeff / idxb_max;
|
||||
double bvecj = my_sna.blist(jdxb,jdx_chem,ii);
|
||||
d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
|
||||
d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the atom number
|
||||
const int ii = team.team_rank() + team.team_size() * team.league_rank();
|
||||
if (ii >= chunk_size) return;
|
||||
int itype = type(ii);
|
||||
int ielem = d_map[itype];
|
||||
|
||||
my_sna.pre_ui_cpu(team,ii,ielem);
|
||||
}
|
||||
|
||||
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.compute_ui_cpu(team,ii,jj);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPZeroYiCPU>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the quantum number
|
||||
const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
|
||||
if (idx >= my_sna.idxu_max) return;
|
||||
|
||||
// Extract the atomic index
|
||||
const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
if (chemflag)
|
||||
for(int ielem = 0; ielem < nelements; ielem++)
|
||||
my_sna.zero_yi_cpu(idx,ii,ielem);
|
||||
else
|
||||
my_sna.zero_yi_cpu(idx,ii,0);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYiCPU,const int& ii) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
my_sna.compute_yi_cpu(ii,d_beta);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZiCPU,const int& ii) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
my_sna.compute_zi_cpu(ii);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU>::member_type& team) const {
|
||||
int ii = team.league_rank();
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
my_sna.compute_bi_cpu(team,ii);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
|
||||
@ -735,6 +978,12 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const t
|
||||
my_sna.compute_deidrj_cpu(team,ii,jj);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Also used for both CPU and GPU codepaths. Could maybe benefit from a
|
||||
separate GPU/CPU codepath, but this kernel takes so little time it's
|
||||
likely not worth it.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
template<int NEIGHFLAG, int EVFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -799,20 +1048,31 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
|
||||
|
||||
// E = beta.B + 0.5*B^t.alpha.B
|
||||
|
||||
const auto idxb_max = snaKK.idxb_max;
|
||||
|
||||
// linear contributions
|
||||
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++)
|
||||
evdwl += d_coeffi[icoeff+1]*my_sna.blist(icoeff,ii);
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
|
||||
}
|
||||
|
||||
// quadratic contributions
|
||||
|
||||
if (quadraticflag) {
|
||||
int k = ncoeff+1;
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
double bveci = my_sna.blist(icoeff,ii);
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
double bveci = my_sna.blist(idxb,idx_chem,ii);
|
||||
|
||||
evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
double bvecj = my_sna.blist(jcoeff,ii);
|
||||
auto jdxb = jcoeff % idxb_max;
|
||||
auto jdx_chem = jcoeff / idxb_max;
|
||||
double bvecj = my_sna.blist(jdxb,jdx_chem,ii);
|
||||
|
||||
evdwl += d_coeffi[k++]*bveci*bvecj;
|
||||
}
|
||||
}
|
||||
|
||||
@ -36,7 +36,9 @@ public:
|
||||
typedef Kokkos::View<double**, DeviceType> t_sna_2d;
|
||||
typedef Kokkos::View<double**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll;
|
||||
typedef Kokkos::View<double***, DeviceType> t_sna_3d;
|
||||
typedef Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> t_sna_3d_ll;
|
||||
typedef Kokkos::View<double***[3], DeviceType> t_sna_4d;
|
||||
typedef Kokkos::View<double****, Kokkos::LayoutLeft, DeviceType> t_sna_4d_ll;
|
||||
typedef Kokkos::View<double**[3], DeviceType> t_sna_3d3;
|
||||
typedef Kokkos::View<double*****, DeviceType> t_sna_5d;
|
||||
|
||||
@ -48,7 +50,8 @@ public:
|
||||
typedef Kokkos::View<SNAcomplex***, DeviceType> t_sna_3c;
|
||||
typedef Kokkos::View<SNAcomplex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll;
|
||||
typedef Kokkos::View<SNAcomplex***[3], DeviceType> t_sna_4c;
|
||||
typedef Kokkos::View<SNAcomplex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
|
||||
typedef Kokkos::View<SNAcomplex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c3_ll;
|
||||
typedef Kokkos::View<SNAcomplex****, Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
|
||||
typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
|
||||
typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
|
||||
|
||||
@ -73,27 +76,39 @@ inline
|
||||
|
||||
int ncoeff;
|
||||
|
||||
// functions for bispectrum coefficients
|
||||
// functions for bispectrum coefficients, GPU only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&, int); // ForceSNAP
|
||||
void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_zi(const int&, const int&, const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_yi(int,int,int,
|
||||
const Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_bi(const int&, const int&, const int&); // ForceSNAP
|
||||
|
||||
// functions for bispectrum coefficients, CPU only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_zi(const int&); // ForceSNAP
|
||||
void compute_zi_cpu(const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void zero_yi(const int&, const int&, int); // ForceSNAP
|
||||
void zero_yi_cpu(const int&,const int&,const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_yi(int,
|
||||
void compute_yi_cpu(int,
|
||||
const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
|
||||
|
||||
// functions for derivatives
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
|
||||
|
||||
// functions for derivatives, GPU only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); //ForceSNAP
|
||||
|
||||
// functions for derivatives, CPU only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -139,18 +154,32 @@ inline
|
||||
|
||||
int twojmax, diagonalstyle;
|
||||
|
||||
t_sna_2d_ll blist;
|
||||
t_sna_2c_ll ulisttot;
|
||||
t_sna_2c_ll zlist;
|
||||
t_sna_3d_ll blist;
|
||||
t_sna_3c_ll ulisttot;
|
||||
t_sna_3c_ll zlist;
|
||||
|
||||
t_sna_3c_ll ulist;
|
||||
t_sna_2c_ll ylist;
|
||||
|
||||
t_sna_3c_ll ylist;
|
||||
|
||||
// derivatives of data
|
||||
t_sna_4c_ll dulist;
|
||||
t_sna_4c3_ll dulist;
|
||||
|
||||
// Modified structures for GPU backend
|
||||
t_sna_3d_ll ulisttot_re; // split real,
|
||||
t_sna_3d_ll ulisttot_im; // imag
|
||||
t_sna_4c_ll ulisttot_pack; // AoSoA layout
|
||||
t_sna_4c_ll zlist_pack; // AoSoA layout
|
||||
t_sna_4d_ll blist_pack;
|
||||
t_sna_4d_ll ylist_pack_re; // split real,
|
||||
t_sna_4d_ll ylist_pack_im; // imag AoSoA layout
|
||||
|
||||
int idxcg_max, idxu_max, idxz_max, idxb_max;
|
||||
|
||||
// Chem snap counts
|
||||
int nelements;
|
||||
int ndoubles;
|
||||
int ntriples;
|
||||
|
||||
private:
|
||||
double rmin0, rfac0;
|
||||
|
||||
@ -212,9 +241,6 @@ inline
|
||||
// Chem snap flags
|
||||
int chem_flag;
|
||||
int bnorm_flag;
|
||||
int nelements;
|
||||
int ndoubles;
|
||||
int ntriples;
|
||||
|
||||
// Self-weight
|
||||
double wself;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -362,6 +362,7 @@ void MSM::setup()
|
||||
|
||||
nmax_direct = 8*(nxhi_direct+1)*(nyhi_direct+1)*(nzhi_direct+1);
|
||||
|
||||
deallocate();
|
||||
if (peratom_allocate_flag) deallocate_peratom();
|
||||
|
||||
// compute direct sum interaction weights
|
||||
@ -612,8 +613,6 @@ void MSM::compute(int eflag, int vflag)
|
||||
|
||||
void MSM::allocate()
|
||||
{
|
||||
deallocate();
|
||||
|
||||
// interpolation coeffs
|
||||
|
||||
order_allocated = order;
|
||||
@ -635,9 +634,9 @@ void MSM::allocate()
|
||||
// allocate memory for each grid level
|
||||
|
||||
for (int n=0; n<levels; n++) {
|
||||
|
||||
memory->create3d_offset(qgrid[n],nzlo_out[n],nzhi_out[n],
|
||||
nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:qgrid");
|
||||
|
||||
memory->create3d_offset(egrid[n],nzlo_out[n],nzhi_out[n],
|
||||
nylo_out[n],nyhi_out[n],nxlo_out[n],nxhi_out[n],"msm:egrid");
|
||||
|
||||
@ -660,23 +659,29 @@ void MSM::allocate()
|
||||
|
||||
void MSM::deallocate()
|
||||
{
|
||||
delete cg_all;
|
||||
cg_all = nullptr;
|
||||
|
||||
memory->destroy2d_offset(phi1d,-order_allocated);
|
||||
memory->destroy2d_offset(dphi1d,-order_allocated);
|
||||
|
||||
if (cg_all) delete cg_all;
|
||||
cg_all = nullptr;
|
||||
|
||||
for (int n=0; n<levels; n++) {
|
||||
memory->destroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
|
||||
memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
|
||||
if (qgrid[n])
|
||||
memory->destroy3d_offset(qgrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
|
||||
|
||||
if (world_levels[n] != MPI_COMM_NULL)
|
||||
MPI_Comm_free(&world_levels[n]);
|
||||
world_levels[n] = MPI_COMM_NULL;
|
||||
active_flag[n] = 0;
|
||||
if (egrid[n])
|
||||
memory->destroy3d_offset(egrid[n],nzlo_out[n],nylo_out[n],nxlo_out[n]);
|
||||
|
||||
delete cg[n];
|
||||
cg[n] = nullptr;
|
||||
if (world_levels)
|
||||
if (world_levels[n] != MPI_COMM_NULL)
|
||||
MPI_Comm_free(&world_levels[n]);
|
||||
|
||||
if (cg) {
|
||||
if (cg[n]) {
|
||||
delete cg[n];
|
||||
cg[n] = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -765,7 +770,6 @@ void MSM::deallocate_peratom()
|
||||
|
||||
void MSM::allocate_levels()
|
||||
{
|
||||
deallocate_levels();
|
||||
ngrid = new int[levels];
|
||||
|
||||
cg = new GridComm*[levels];
|
||||
@ -815,21 +819,21 @@ void MSM::allocate_levels()
|
||||
v5grid = new double***[levels];
|
||||
|
||||
for (int n=0; n<levels; n++) {
|
||||
cg[n] = nullptr;
|
||||
cg[n] = NULL;
|
||||
world_levels[n] = MPI_COMM_NULL;
|
||||
active_flag[n] = 0;
|
||||
cg_peratom[n] = nullptr;
|
||||
cg_peratom[n] = NULL;
|
||||
|
||||
qgrid[n] = nullptr;
|
||||
egrid[n] = nullptr;
|
||||
qgrid[n] = NULL;
|
||||
egrid[n] = NULL;
|
||||
|
||||
v0grid[n] = nullptr;
|
||||
v1grid[n] = nullptr;
|
||||
v2grid[n] = nullptr;
|
||||
v3grid[n] = nullptr;
|
||||
v4grid[n] = nullptr;
|
||||
v5grid[n] = nullptr;
|
||||
v0grid[n] = NULL;
|
||||
v1grid[n] = NULL;
|
||||
v2grid[n] = NULL;
|
||||
v3grid[n] = NULL;
|
||||
v4grid[n] = NULL;
|
||||
v5grid[n] = NULL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -1105,6 +1109,7 @@ void MSM::set_grid_global()
|
||||
|
||||
if (!domain->nonperiodic) levels -= 1;
|
||||
|
||||
deallocate_levels();
|
||||
allocate_levels();
|
||||
|
||||
// find number of grid levels in each direction
|
||||
|
||||
@ -33,6 +33,8 @@ using namespace MathConst;
|
||||
#define DELTA 16384
|
||||
#define DELTA_BONUS 8192
|
||||
|
||||
int AtomVec::num_atom_vecs = 0;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp)
|
||||
@ -54,6 +56,8 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp)
|
||||
|
||||
threads = NULL;
|
||||
|
||||
++num_atom_vecs;
|
||||
|
||||
// peratom variables auto-included in corresponding child style fields string
|
||||
// these fields cannot be specified in the fields string
|
||||
|
||||
@ -93,44 +97,48 @@ AtomVec::~AtomVec()
|
||||
int datatype,cols;
|
||||
void *pdata;
|
||||
|
||||
--num_atom_vecs;
|
||||
|
||||
for (int i = 0; i < nargcopy; i++) delete [] argcopy[i];
|
||||
delete [] argcopy;
|
||||
|
||||
memory->destroy(atom->tag);
|
||||
memory->destroy(atom->type);
|
||||
memory->destroy(atom->mask);
|
||||
memory->destroy(atom->image);
|
||||
memory->destroy(atom->x);
|
||||
memory->destroy(atom->v);
|
||||
memory->destroy(atom->f);
|
||||
if (num_atom_vecs == 0) {
|
||||
memory->destroy(atom->tag);
|
||||
memory->destroy(atom->type);
|
||||
memory->destroy(atom->mask);
|
||||
memory->destroy(atom->image);
|
||||
memory->destroy(atom->x);
|
||||
memory->destroy(atom->v);
|
||||
memory->destroy(atom->f);
|
||||
|
||||
for (int i = 0; i < ngrow; i++) {
|
||||
pdata = mgrow.pdata[i];
|
||||
datatype = mgrow.datatype[i];
|
||||
cols = mgrow.cols[i];
|
||||
if (datatype == Atom::DOUBLE) {
|
||||
if (cols == 0)
|
||||
memory->destroy(*((double **) pdata));
|
||||
else if (cols > 0)
|
||||
memory->destroy(*((double ***) pdata));
|
||||
else {
|
||||
memory->destroy(*((double ***) pdata));
|
||||
}
|
||||
} else if (datatype == Atom::INT) {
|
||||
if (cols == 0)
|
||||
memory->destroy(*((int **) pdata));
|
||||
else if (cols > 0)
|
||||
memory->destroy(*((int ***) pdata));
|
||||
else {
|
||||
memory->destroy(*((int ***) pdata));
|
||||
}
|
||||
} else if (datatype == Atom::BIGINT) {
|
||||
if (cols == 0)
|
||||
memory->destroy(*((bigint **) pdata));
|
||||
else if (cols > 0)
|
||||
memory->destroy(*((bigint ***) pdata));
|
||||
else {
|
||||
memory->destroy(*((bigint ***) pdata));
|
||||
for (int i = 0; i < ngrow; i++) {
|
||||
pdata = mgrow.pdata[i];
|
||||
datatype = mgrow.datatype[i];
|
||||
cols = mgrow.cols[i];
|
||||
if (datatype == Atom::DOUBLE) {
|
||||
if (cols == 0)
|
||||
memory->destroy(*((double **) pdata));
|
||||
else if (cols > 0)
|
||||
memory->destroy(*((double ***) pdata));
|
||||
else {
|
||||
memory->destroy(*((double ***) pdata));
|
||||
}
|
||||
} else if (datatype == Atom::INT) {
|
||||
if (cols == 0)
|
||||
memory->destroy(*((int **) pdata));
|
||||
else if (cols > 0)
|
||||
memory->destroy(*((int ***) pdata));
|
||||
else {
|
||||
memory->destroy(*((int ***) pdata));
|
||||
}
|
||||
} else if (datatype == Atom::BIGINT) {
|
||||
if (cols == 0)
|
||||
memory->destroy(*((bigint **) pdata));
|
||||
else if (cols > 0)
|
||||
memory->destroy(*((bigint ***) pdata));
|
||||
else {
|
||||
memory->destroy(*((bigint ***) pdata));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -209,6 +209,10 @@ class AtomVec : protected Pointers {
|
||||
|
||||
bool *threads;
|
||||
|
||||
// counter for atom vec instances
|
||||
|
||||
static int num_atom_vecs;
|
||||
|
||||
// local methods
|
||||
|
||||
void grow_nmax();
|
||||
|
||||
Reference in New Issue
Block a user