Removed unnecessary team parallelism from CPU routines; rearranged pair_snap_kokkos_impl to make the subsequent CPU/GPU unifications easier to follow.

This commit is contained in:
Evan Weinberg
2024-11-19 12:44:08 -08:00
parent 976167e2e5
commit 277fba1907
4 changed files with 530 additions and 507 deletions

View File

@ -204,10 +204,10 @@ class PairSNAPKokkos : public PairSNAP {
void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeighCPU>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUiCPU>::member_type& team) const;
void operator() (TagPairSNAPPreUiCPU, const int& iatom) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
void operator() (TagPairSNAPComputeUiCPU, const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const;
@ -222,10 +222,10 @@ class PairSNAPKokkos : public PairSNAP {
void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
void operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const;
template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION

View File

@ -204,7 +204,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
const int inum_pad = inum_div * vector_length;
MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta", inum_pad, ncoeff);
snaKK.d_beta = d_beta;
MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum);
MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum_pad);
}
chunk_size = MIN(chunksize, inum); // "chunksize" variable is set by user
@ -235,17 +235,14 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
//PreUi
{
int team_size = team_size_default;
check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu(0, chunk_size);
Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
}
// ComputeUi
{
int team_size = team_size_default;
// Fused calculation of ulist and accumulation into ulisttot using atomics
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(0, chunk_size*max_neighs);
Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
}
@ -283,13 +280,10 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
//ComputeDuidrj and Deidrj
{
int team_size = team_size_default;
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(0,chunk_size*max_neighs);
Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(0,chunk_size*max_neighs);
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
}
@ -601,47 +595,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::coeff(int narg, char
snaKK.init();
}
/* ----------------------------------------------------------------------
Begin routines that are common to both the CPU and GPU codepath.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBeta, const int& iatom) const {
if (iatom >= chunk_size) return;
const int i = d_ilist[iatom + chunk_offset];
const int itype = type[i];
const int ielem = d_map[itype];
auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
d_beta(iatom, icoeff) = d_coeffi[icoeff+1];
}
if (quadraticflag) {
const auto idxb_max = snaKK.idxb_max;
int k = ncoeff+1;
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
real_type bveci = snaKK.blist(iatom, idx_chem, idxb);
d_beta(iatom, icoeff) += d_coeffi[k] * bveci;
k++;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
const auto jdxb = jcoeff % idxb_max;
const auto jdx_chem = jcoeff / idxb_max;
real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb);
d_beta(iatom, icoeff) += d_coeffi[k] * bvecj;
d_beta(iatom, jcoeff) += d_coeffi[k] * bveci;
k++;
}
}
}
}
/* ----------------------------------------------------------------------
Begin routines that are unique to the GPU codepath. These take advantage
of AoSoA data layouts and scratch memory for recursive polynomials
@ -737,222 +690,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
const int ninside = d_ninside(iatom);
if (jnbor >= ninside) return;
snaKK.compute_cayley_klein(iatom,jnbor);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
int itype = type(iatom);
int ielem = d_map[itype];
snaKK.pre_ui(iatom, j, ielem);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
const int jbend = jj_jbend / max_neighs;
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - iatom_div * max_neighs;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (idxu > snaKK.idxu_max) return;
int elem_count = chemflag ? nelements : 1;
for (int ielem = 0; ielem < elem_count; ielem++) {
const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half);
auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half);
if (mapper.flip_sign == 1) {
utot_im = -utot_im;
} else if (mapper.flip_sign == -1) {
utot_re = -utot_re;
}
snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im };
if (mapper.flip_sign == 0) {
snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.;
snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.;
}
}
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= snaKK.idxz_max) return;
snaKK.compute_yi(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= snaKK.idxz_max) return;
snaKK.compute_yi_with_zlist(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= snaKK.idxz_max) return;
snaKK.compute_zi(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjb >= snaKK.idxb_max) return;
snaKK.compute_bi(iatom,jjb);
}
template<class DeviceType, typename real_type, int vector_length>
template<int dir>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
const int jbend = jj_jbend / max_neighs;
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
template<int dir>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - max_neighs * iatom_div;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
});
}
/* ----------------------------------------------------------------------
Begin routines that are unique to the CPU codepath. These do not take
advantage of AoSoA data layouts, but that could be a good point of
future optimization and unification with the above kernels. It's unlikely
that scratch memory optimizations will ever be useful for the CPU due to
different arithmetic intensity requirements for the CPU vs GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeighCPU>::member_type& team) const {
@ -1028,36 +765,156 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
});
}
/* ----------------------------------------------------------------------
Pre-compute the Cayley-Klein parameters for reuse in later routines.
GPU only.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU>::member_type& team) const {
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
// Extract the atom number
const int ii = team.team_rank() + team.team_size() * team.league_rank();
if (ii >= chunk_size) return;
int itype = type(ii);
int ielem = d_map[itype];
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
snaKK.pre_ui_cpu(team,ii,ielem);
const int ninside = d_ninside(iatom);
if (jnbor >= ninside) return;
snaKK.compute_cayley_klein(iatom, jnbor);
}
/* ----------------------------------------------------------------------
Initialize the "ulisttot" structure with non-zero on-diagonal terms
and zero terms elsewhere; both CPU and GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
if (ii >= chunk_size) return;
int itype = type(iatom);
int ielem = d_map[itype];
// Extract the neighbor number
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.pre_ui(iatom, j, ielem);
}
snaKK.compute_ui_cpu(team,ii,jj);
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU, const int& iatom) const {
const int itype = type(iatom);
const int ielem = d_map[itype];
snaKK.pre_ui_cpu(iatom, ielem);
}
/* ----------------------------------------------------------------------
Accumulate the spectral contributions from atom, neighbor pairs into
ulisttot_re and _im. These routines are GPU only and use scratch memory
staging.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
const int jbend = jj_jbend / max_neighs;
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - iatom_div * max_neighs;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
});
}
/* ----------------------------------------------------------------------
Accumulate the spectral contributions from atom, neighbor pairs into
ulisttot_re and _im. This routine is CPU only and does not use staging.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiCPU, const int& ii) const {
const int iatom = ii / max_neighs;
const int jnbor = ii % max_neighs;
const int ninside = d_ninside(iatom);
if (jnbor >= ninside) return;
snaKK.compute_ui_cpu(iatom, jnbor);
}
/* ----------------------------------------------------------------------
De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
structure. Zero-initialize ylist. CPU and GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (idxu > snaKK.idxu_max) return;
int elem_count = chemflag ? nelements : 1;
for (int ielem = 0; ielem < elem_count; ielem++) {
const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half);
auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half);
if (mapper.flip_sign == 1) {
utot_im = -utot_im;
} else if (mapper.flip_sign == -1) {
utot_re = -utot_re;
}
snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im };
if (mapper.flip_sign == 0) {
snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.;
snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.;
}
}
}
template<class DeviceType, typename real_type, int vector_length>
@ -1110,10 +967,21 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
}
}
/* ----------------------------------------------------------------------
Compute all elements of the Z tensor and store them into the `zlist`
View. This is only used for energy timesteps or quadratic SNAP.
CPU and GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU,const int& ii) const {
snaKK.compute_yi_cpu(ii);
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= snaKK.idxz_max) return;
snaKK.compute_zi(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
@ -1122,6 +990,22 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
snaKK.compute_zi_cpu(ii);
}
/* ----------------------------------------------------------------------
Compute the energy triple products and store in the "blist" View.
CPU and GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjb >= snaKK.idxb_max) return;
snaKK.compute_bi(iatom,jjb);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBiCPU, const int& ii) const {
@ -1130,42 +1014,187 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
snaKK.compute_bi(iatom, jjb);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
if (ii >= chunk_size) return;
// Extract the neighbor number
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.compute_duidrj_cpu(team,ii,jj);
}
/* ----------------------------------------------------------------------
Assemble the "beta" coefficients that enter the computation of the
adjoint matrices Y. For quadratic SNAP, this includes accumulating
energy triple products into an "effective" beta that encodes the
quadratic terms with otherwise linear compute work.
CPU and GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBeta, const int& iatom) const {
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
if (ii >= chunk_size) return;
if (iatom >= chunk_size) return;
// Extract the neighbor number
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
const int i = d_ilist[iatom + chunk_offset];
const int itype = type[i];
const int ielem = d_map[itype];
snaKK.compute_deidrj_cpu(team,ii,jj);
auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
d_beta(iatom, icoeff) = d_coeffi[icoeff+1];
}
if (quadraticflag) {
const auto idxb_max = snaKK.idxb_max;
int k = ncoeff+1;
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
real_type bveci = snaKK.blist(iatom, idx_chem, idxb);
d_beta(iatom, icoeff) += d_coeffi[k] * bveci;
k++;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
const auto jdxb = jcoeff % idxb_max;
const auto jdx_chem = jcoeff / idxb_max;
real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb);
d_beta(iatom, icoeff) += d_coeffi[k] * bvecj;
d_beta(iatom, jcoeff) += d_coeffi[k] * bveci;
k++;
}
}
}
}
/* ----------------------------------------------------------------------
Also used for both CPU and GPU codepaths. Could maybe benefit from a
separate GPU/CPU codepath, but this kernel takes so little time it's
likely not worth it.
Compute all elements of the Z tensor and accumultate them into the
adjoint matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= snaKK.idxz_max) return;
snaKK.compute_yi(iatom, jjz);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU, const int& ii) const {
snaKK.compute_yi_cpu(ii);
}
/* ----------------------------------------------------------------------
Accumulate the pre-computed elements of the Z tensor into the adjoint
matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= snaKK.idxz_max) return;
snaKK.compute_yi_with_zlist(iatom, jjz);
}
/* ----------------------------------------------------------------------
Assemble the force contributions for each atom, neighbor pair by
contracting the adjoint matrices Y with derivatives of the Wigner
matrices U. These routines are GPU only and use scratch memory
staging.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
template<int dir>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
const int jbend = jj_jbend / max_neighs;
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
template<int dir>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - max_neighs * iatom_div;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
snaKK.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
});
}
/* ----------------------------------------------------------------------
Assemble the derivatives of the Winger matrices U into the View
"dulist". CPU only.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const {
const int iatom = ii / max_neighs;
const int jnbor = ii % max_neighs;
const int ninside = d_ninside(iatom);
if (jnbor >= ninside) return;
snaKK.compute_duidrj_cpu(iatom, jnbor);
}
/* ----------------------------------------------------------------------
Assemble the force contributions for each atom, neighbor pair by
contracting the adjoint matrices Y with the pre-computed derivatives
of the Wigner matrices U. CPU only.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const {
const int iatom = ii / max_neighs;
const int jnbor = ii % max_neighs;
const int ninside = d_ninside(iatom);
if (jnbor >= ninside) return;
snaKK.compute_deidrj_cpu(iatom, jnbor);
}
/* ----------------------------------------------------------------------
This routine formally accumulates the "chunked" force contributions
into the broader LAMMPS "f" force View. As appropriate it
also accumulates the total energy and the virial. CPU and GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>

View File

@ -237,9 +237,9 @@ class SNAKokkos {
// functions for bispectrum coefficients, CPU only
KOKKOS_INLINE_FUNCTION
void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&) const; // ForceSNAP
void pre_ui_cpu(const int&, const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; // ForceSNAP
void compute_ui_cpu(const int&, const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_zi_cpu(const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION
@ -247,9 +247,9 @@ class SNAKokkos {
// functions for derivatives, CPU only
KOKKOS_INLINE_FUNCTION
void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; //ForceSNAP
void compute_duidrj_cpu(const int&, const int&) const; //ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; // ForceSNAP
void compute_deidrj_cpu(const int&, const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION
real_type compute_sfac(real_type, real_type, real_type, real_type) const; // add_uarraytot, compute_duarray
@ -358,11 +358,10 @@ class SNAKokkos {
void init_rootpqarray(); // init()
KOKKOS_INLINE_FUNCTION
void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui
void add_uarraytot(int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui
KOKKOS_INLINE_FUNCTION
void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
const real_type&, const real_type&, const real_type&,
void compute_uarray_cpu(int, int, const real_type&, const real_type&, const real_type&,
const real_type&, const real_type&) const; // compute_ui_cpu
@ -372,8 +371,7 @@ class SNAKokkos {
inline
int compute_ncoeff(); // SNAKokkos()
KOKKOS_INLINE_FUNCTION
void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
void compute_duarray_cpu(int, int, const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
const real_type&, const real_type&, const real_type&, const real_type&, const real_type&,
const real_type&, const real_type&) const;

View File

@ -1156,17 +1156,14 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::real_type SNAKokkos<De
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem) const
void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const int& iatom, const int& ielem) const
{
for (int jelem = 0; jelem < nelements; jelem++) {
for (int j = 0; j <= twojmax; j++) {
int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug
// Only diagonal elements get initialized
// for (int m = 0; m < (j+1)*(j/2+1); m++)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
[&] (const int m) {
for (int m = 0; m < (j+1)*(j/2+1); m++) {
const int jjup = jju + m;
// if m is on the "diagonal", initialize it with the self energy.
@ -1176,7 +1173,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename
ulisttot_re(iatom, jelem, jjup) = init;
ulisttot_im(iatom, jelem, jjup) = 0;
});
};
}
}
@ -1191,7 +1188,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const int& iatom, const int& jnbor) const
{
real_type rsq, r, x, y, z, z0, theta0;
@ -1211,8 +1208,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const typen
// theta0 = (r - rmin0) * rscale0;
z0 = r / tan(theta0);
compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r);
add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor));
compute_uarray_cpu(iatom, jnbor, x, y, z, z0, r);
add_uarraytot(iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor));
}
/* ----------------------------------------------------------------------
@ -1328,7 +1325,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_cpu(int iter) c
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const int& iatom, const int& jnbor) const
{
real_type rsq, r, x, y, z, z0, theta0, cs, sn;
real_type dz0dr;
@ -1345,7 +1342,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const t
z0 = r * cs / sn;
dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor));
compute_duarray_cpu(iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor));
}
@ -1360,54 +1357,42 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const t
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const int& iatom, const int& jnbor) const
{
t_scalar3<real_type> final_sum;
real_type force_sum[3] = { 0, 0, 0 };
const int jelem = element(iatom, jnbor);
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
[&] (const int& j, t_scalar3<real_type>& sum_tmp) {
for (int j = 0; j <= twojmax; j++) {
int jju_half = idxu_half_block[j];
int jju_cache = idxu_cache_block[j];
for (int mb = 0; 2*mb < j; mb++)
for (int mb = 0; 2 * mb < j; mb++) {
for (int ma = 0; ma <= j; ma++) {
const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
for (int k = 0; k < 3; k++)
force_sum[k] += dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re +
dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im;
jju_half++; jju_cache++;
} //end loop over ma mb
}
} //end loop over ma mb
// For j even, handle middle column
if (j%2 == 0) {
int mb = j/2;
for (int ma = 0; ma < mb; ma++) {
const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
if (j % 2 == 0) {
//int mb = j / 2;
for (int ma = 0; ma <= j; ma++) {
complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
for (int k = 0; k < 3; k++)
force_sum[k] += static_cast<real_type>(0.5) * (dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re +
dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im);
jju_half++; jju_cache++;
}
//int ma = mb;
// 0.5 is meant to avoid double-counting
const complex y_val = { 0.5 * ylist_re(iatom, jelem, jju_half), 0.5 * ylist_im(iatom, jelem, jju_half) };
sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
} // end if jeven
},final_sum); // end loop over j
Kokkos::single(Kokkos::PerThread(team), [&] () {
dedr(iatom,jnbor,0) = final_sum.x*2.0;
dedr(iatom,jnbor,1) = final_sum.y*2.0;
dedr(iatom,jnbor,2) = final_sum.z*2.0;
});
}
for (int k = 0; k < 3; k++)
dedr(iatom, jnbor, k) = 2 * force_sum[k];
}
@ -1421,15 +1406,13 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const t
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(int iatom, int jnbor,
const real_type& r, const real_type& wj, const real_type& rcut,
const real_type& sinner, const real_type& dinner, int jelem) const
{
const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
[&] (const int& j) {
for (int j = 0; j <= twojmax; j++) {
int jju_half = idxu_half_block[j]; // index into ulisttot
int jju_cache = idxu_cache_block[j]; // index into ulist
@ -1441,7 +1424,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typena
count++;
}
}
});
}
}
/* ----------------------------------------------------------------------
@ -1452,23 +1435,15 @@ void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typena
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(int iatom, int jnbor,
const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const
{
real_type r0inv;
real_type a_r, b_r, a_i, b_i;
real_type rootpq;
// compute Cayley-Klein parameters for unit quaternion
r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
a_r = r0inv * z0;
a_i = -r0inv * z;
b_r = r0inv * y;
b_i = -r0inv * x;
real_type r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
complex a = { r0inv * z0, -r0inv * z };
complex b = { r0inv * y, -r0inv * x };
// VMK Section 4.8.2
ulist_cpu(iatom, jnbor, 0) = complex::one();
for (int j = 1; j <= twojmax; j++) {
@ -1476,61 +1451,65 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const t
int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug
// fill in left side of matrix layer from previous layer
for (int mb = 0; 2*mb <= j; mb++) {
int jju_index = jju + (j + 1) * mb;
int jjup_index = jjup + j * mb;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
[&] (const int& mb) {
//for (int mb = 0; 2*mb <= j; mb++) {
const int jju_index = jju+mb+mb*j;
ulist_cpu(iatom, jnbor, jju_index) = { static_cast<real_type>(0), static_cast<real_type>(0) };
complex ui = complex::zero();
for (int ma = 0; ma < j; ma++) {
const int jju_index = jju+mb+mb*j+ma;
const int jjup_index = jjup+mb*j+ma;
rootpq = rootpqarray(j - ma,j - mb);
ulist_cpu(iatom, jnbor, jju_index).re +=
rootpq *
(a_r * ulist_cpu(iatom, jnbor, jjup_index).re +
a_i * ulist_cpu(iatom, jnbor, jjup_index).im);
ulist_cpu(iatom, jnbor, jju_index).im +=
rootpq *
(a_r * ulist_cpu(iatom, jnbor, jjup_index).im -
a_i * ulist_cpu(iatom, jnbor, jjup_index).re);
complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
rootpq = rootpqarray(ma + 1,j - mb);
ulist_cpu(iatom, jnbor, jju_index+1).re =
-rootpq *
(b_r * ulist_cpu(iatom, jnbor, jjup_index).re +
b_i * ulist_cpu(iatom, jnbor, jjup_index).im);
ulist_cpu(iatom, jnbor, jju_index+1).im =
-rootpq *
(b_r * ulist_cpu(iatom, jnbor, jjup_index).im -
b_i * ulist_cpu(iatom, jnbor, jjup_index).re);
real_type rootpq = rootpqarray(j - ma, j - mb);
ui.re += rootpq * (a.re * ui_prev.re + a.im * ui_prev.im);
ui.im += rootpq * (a.re * ui_prev.im - a.im * ui_prev.re);
ulist_cpu(iatom, jnbor, jju_index) = ui;
rootpq = rootpqarray(ma + 1, j - mb);
ui.re = -rootpq * (b.re * ui_prev.re + b.im * ui_prev.im);
ui.im = -rootpq * (b.re * ui_prev.im - b.im * ui_prev.re);
jju_index++;
jjup_index++;
}
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
ulist_cpu(iatom, jnbor, jju_index) = ui;
}
// Only need to add one symmetrized row for convenience
// Symmetry gets "unfolded" in accumulating ulisttot
if (j%2==1 && mb==(j/2)) {
const int mbpar = (mb)%2==0?1:-1;
int mapar = mbpar;
for (int ma = 0; ma <= j; ma++) {
const int jju_index = jju + mb*(j+1) + ma;
const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1);
if (mapar == 1) {
ulist_cpu(iatom, jnbor, jjup_index).re = ulist_cpu(iatom, jnbor, jju_index).re;
ulist_cpu(iatom, jnbor, jjup_index).im = -ulist_cpu(iatom, jnbor, jju_index).im;
} else {
ulist_cpu(iatom, jnbor, jjup_index).re = -ulist_cpu(iatom, jnbor, jju_index).re;
ulist_cpu(iatom, jnbor, jjup_index).im = ulist_cpu(iatom, jnbor, jju_index).im;
}
mapar = -mapar;
}
// If j is odd (half-integer in the mathematical convention), we need
// to add one more row for convenience (for now). This can either be done
// via symmetry (see the commented code below), or by the equations to fill
// from the left instead of the right
if (j % 2 == 1) {
int mb = j / 2;
// begin filling in the extra row
int jju_index = jju + (mb + 1) * (j + 1);
int jjup_index = jjup + mb * j;
complex ui = complex::zero();
for (int ma = 0; ma < j; ma++) {
complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
real_type rootpq = rootpqarray(j - ma, mb + 1);
ui.re += rootpq * (b.re * ui_prev.re - b.im * ui_prev.im);
ui.im += rootpq * (b.re * ui_prev.im + b.im * ui_prev.re);
ulist_cpu(iatom, jnbor, jju_index) = ui;
rootpq = rootpqarray(ma + 1, mb + 1);
ui.re = rootpq * (a.re * ui_prev.re - a.im * ui_prev.im);
ui.im = rootpq * (a.re * ui_prev.im + a.im * ui_prev.re);
jju_index++;
jjup_index++;
}
});
ulist_cpu(iatom, jnbor, jju_index) = ui;
}
}
}
/* ----------------------------------------------------------------------
@ -1541,30 +1520,25 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const t
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(int iatom, int jnbor,
const real_type& x, const real_type& y, const real_type& z,
const real_type& z0, const real_type& r, const real_type& dz0dr,
const real_type& wj, const real_type& rcut,
const real_type& sinner, const real_type& dinner) const
{
real_type r0inv;
real_type a_r, a_i, b_r, b_i;
real_type u[3], da_r[3], da_i[3], db_r[3], db_i[3];
real_type dz0[3], dr0inv[3], dr0invdr;
real_type rootpq;
complex da[3], db[3];
real_type u[3], dz0[3], dr0inv[3];
real_type rinv = 1.0 / r;
u[0] = x * rinv;
u[1] = y * rinv;
u[2] = z * rinv;
r0inv = 1.0 / sqrt(r * r + z0 * z0);
a_r = z0 * r0inv;
a_i = -z * r0inv;
b_r = y * r0inv;
b_i = -x * r0inv;
real_type r0inv = 1.0 / sqrt(r * r + z0 * z0);
complex a = { z0 * r0inv, -z * r0inv };
complex b = { y * r0inv, -x * r0inv };
dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
real_type dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
dr0inv[0] = dr0invdr * u[0];
dr0inv[1] = dr0invdr * u[1];
@ -1575,19 +1549,19 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
dz0[2] = dz0dr * u[2];
for (int k = 0; k < 3; k++) {
da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k];
da_i[k] = -z * dr0inv[k];
da[k].re = dz0[k] * r0inv + z0 * dr0inv[k];
da[k].im = -z * dr0inv[k];
}
da_i[2] += -r0inv;
da[2].im += -r0inv;
for (int k = 0; k < 3; k++) {
db_r[k] = y * dr0inv[k];
db_i[k] = -x * dr0inv[k];
db[k].re = y * dr0inv[k];
db[k].im = -x * dr0inv[k];
}
db_i[0] += -r0inv;
db_r[1] += r0inv;
db[0].im += -r0inv;
db[1].re += r0inv;
for (int k = 0; k < 3; k++)
dulist_cpu(iatom, jnbor, 0, k) = complex::zero();
@ -1595,72 +1569,94 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
for (int j = 1; j <= twojmax; j++) {
int jju = idxu_cache_block[j];
int jjup = idxu_cache_block[j-1];
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
[&] (const int& mb) {
//for (int mb = 0; 2*mb <= j; mb++) {
const int jju_index = jju+mb+mb*j;
for (int k = 0; k < 3; k++)
dulist_cpu(iatom, jnbor, jju_index, k) = complex::zero();
for (int mb = 0; 2*mb <= j; mb++) {
int jju_index = jju + mb * (j + 1);
int jjup_index = jjup + mb * j;
complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() };
for (int ma = 0; ma < j; ma++) {
const int jju_index = jju+mb+mb*j+ma;
const int jjup_index = jjup+mb*j+ma;
rootpq = rootpqarray(j - ma,j - mb);
complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0),
dulist_cpu(iatom, jnbor, jjup_index, 1),
dulist_cpu(iatom, jnbor, jjup_index, 2) };
real_type rootpq = rootpqarray(j - ma,j - mb);
for (int k = 0; k < 3; k++) {
dulist_cpu(iatom, jnbor, jju_index, k).re +=
rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im +
a_r * dulist_cpu(iatom, jnbor, jjup_index, k).re +
a_i * dulist_cpu(iatom, jnbor, jjup_index, k).im);
dulist_cpu(iatom, jnbor, jju_index, k).im +=
rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im -
da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
a_r * dulist_cpu(iatom, jnbor, jjup_index, k).im -
a_i * dulist_cpu(iatom, jnbor, jjup_index, k).re);
duidrj[k].re += rootpq * (da[k].re * ui_prev.re + da[k].im * ui_prev.im +
a.re * duidrj_prev[k].re + a.im * duidrj_prev[k].im);
duidrj[k].im += rootpq * (da[k].re * ui_prev.im - da[k].im * ui_prev.re +
a.re * duidrj_prev[k].im - a.im * duidrj_prev[k].re);
dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
}
rootpq = rootpqarray(ma + 1,j - mb);
for (int k = 0; k < 3; k++) {
dulist_cpu(iatom, jnbor, jju_index+1, k).re =
-rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im +
b_r * dulist_cpu(iatom, jnbor, jjup_index, k).re +
b_i * dulist_cpu(iatom, jnbor, jjup_index, k).im);
dulist_cpu(iatom, jnbor, jju_index+1, k).im =
-rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im -
db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
b_r * dulist_cpu(iatom, jnbor, jjup_index, k).im -
b_i * dulist_cpu(iatom, jnbor, jjup_index, k).re);
duidrj[k].re = -rootpq * (db[k].re * ui_prev.re + db[k].im * ui_prev.im +
b.re * duidrj_prev[k].re + b.im * duidrj_prev[k].im);
duidrj[k].im = -rootpq * (db[k].re * ui_prev.im - db[k].im * ui_prev.re +
b.re * duidrj_prev[k].im - b.im * duidrj_prev[k].re);
}
jju_index++;
jjup_index++;
}
// Only need to add one symmetrized row for convenience
// Symmetry gets "unfolded" during the dedr accumulation
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
if (j%2==1 && mb==(j/2)) {
const int mbpar = (mb)%2==0?1:-1;
int mapar = mbpar;
for (int ma = 0; ma <= j; ma++) {
const int jju_index = jju+mb*(j+1)+ma;
const int jjup_index = jju+(mb+2)*(j+1)-(ma+1);
if (mapar == 1) {
for (int k = 0; k < 3; k++) {
dulist_cpu(iatom, jnbor, jjup_index, k).re = dulist_cpu(iatom, jnbor, jju_index, k).re;
dulist_cpu(iatom, jnbor, jjup_index, k).im = -dulist_cpu(iatom, jnbor, jju_index, k).im;
}
} else {
for (int k = 0; k < 3; k++) {
dulist_cpu(iatom, jnbor, jjup_index, k).re = -dulist_cpu(iatom, jnbor, jju_index, k).re;
dulist_cpu(iatom, jnbor, jjup_index, k).im = dulist_cpu(iatom, jnbor, jju_index, k).im;
}
}
mapar = -mapar;
}
for (int k = 0; k < 3; k++) {
dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
}
});
}
// Only need to add one symmetrized row for convenience
// Symmetry gets "unfolded" during the dedr accumulation
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
if (j % 2 == 1) {
int mb = j / 2;
// begin filling in the extra row
int jju_index = jju + (mb + 1) * (j + 1);
int jjup_index = jjup + mb * j;
complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() };
for (int ma = 0; ma < j; ma++) {
complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0),
dulist_cpu(iatom, jnbor, jjup_index, 1),
dulist_cpu(iatom, jnbor, jjup_index, 2) };
real_type rootpq = rootpqarray(j - ma, mb + 1);
for (int k = 0; k < 3; k++) {
duidrj[k].re += rootpq * (db[k].re * ui_prev.re - db[k].im * ui_prev.im +
b.re * duidrj_prev[k].re - b.im * duidrj_prev[k].im);
duidrj[k].im += rootpq * (db[k].re * ui_prev.im + db[k].im * ui_prev.re +
b.re * duidrj_prev[k].im + b.im * duidrj_prev[k].re);
dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
}
rootpq = rootpqarray(ma + 1, mb + 1);
for (int k = 0; k < 3; k++) {
duidrj[k].re = rootpq * (da[k].re * ui_prev.re - da[k].im * ui_prev.im +
a.re * duidrj_prev[k].re - a.im * duidrj_prev[k].im);
duidrj[k].im = rootpq * (da[k].re * ui_prev.im + da[k].im * ui_prev.re +
a.re * duidrj_prev[k].im + a.im * duidrj_prev[k].re);
}
jju_index++;
jjup_index++;
}
for (int k = 0; k < 3; k++) {
dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
}
}
}
real_type sfac = compute_sfac(r, rcut, sinner, dinner);
@ -1678,9 +1674,9 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
for (int ma = 0; ma <= j; ma++) {
for (int k = 0; k < 3; k++) {
dulist_cpu(iatom, jnbor, jju, k).re = dsfac * ulist_cpu(iatom, jnbor, jju).re * u[k] +
sfac * dulist_cpu(iatom, jnbor, jju, k).re;
sfac * dulist_cpu(iatom, jnbor, jju, k).re;
dulist_cpu(iatom, jnbor, jju, k).im = dsfac * ulist_cpu(iatom, jnbor, jju).im * u[k] +
sfac * dulist_cpu(iatom, jnbor, jju, k).im;
sfac * dulist_cpu(iatom, jnbor, jju, k).im;
}
jju++;
}