Removed unnecessary team parallelism from CPU routines; rearranged pair_snap_kokkos_impl to make the subsequent CPU/GPU unifications easier to follow.
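For context, the core pattern of this change is replacing the hierarchical `Kokkos::TeamPolicy` launches of the CPU kernels (with atom/neighbor indices reconstructed from team and league ranks) by flat `Kokkos::RangePolicy` launches over a simple per-pair index. A minimal, self-contained sketch of the before/after dispatch, using a hypothetical `TagComputeFooCPU` tag and simplified member names rather than the actual LAMMPS functors:

```cpp
// sketch.cpp -- illustrative only; tag and member names are made up.
#include <Kokkos_Core.hpp>

struct TagComputeFooCPU {};  // hypothetical work tag

struct FooFunctor {
  int chunk_size, max_neighs;
  Kokkos::View<int*> d_ninside;  // neighbors inside the cutoff, per atom

  // Old CPU style: team policy; atom/neighbor indices rebuilt from team ranks.
  KOKKOS_INLINE_FUNCTION
  void operator()(TagComputeFooCPU,
                  const Kokkos::TeamPolicy<>::member_type& team) const {
    const int per_league = (chunk_size + team.team_size() - 1) / team.team_size();
    const int ii = team.team_rank() + team.team_size() * (team.league_rank() % per_league);
    const int jj = team.league_rank() / per_league;
    if (ii >= chunk_size || jj >= d_ninside(ii)) return;
    // ... per-(atom, neighbor) work ...
  }

  // New CPU style: one flat index per (atom, neighbor) pair.
  KOKKOS_INLINE_FUNCTION
  void operator()(TagComputeFooCPU, const int& idx) const {
    const int iatom = idx / max_neighs;
    const int jnbor = idx % max_neighs;
    if (jnbor >= d_ninside(iatom)) return;
    // ... per-(atom, neighbor) work ...
  }
};

int main(int argc, char** argv) {
  Kokkos::initialize(argc, argv);
  {
    const int chunk_size = 128, max_neighs = 32;  // example sizes
    Kokkos::View<int*> d_ninside("ninside", chunk_size);
    Kokkos::deep_copy(d_ninside, max_neighs / 2);

    FooFunctor f{chunk_size, max_neighs, d_ninside};

    // Before: TeamPolicy with ceil(chunk_size/team_size)*max_neighs leagues.
    // After: a plain RangePolicy over all (atom, neighbor) pairs.
    Kokkos::RangePolicy<TagComputeFooCPU> policy(0, chunk_size * max_neighs);
    Kokkos::parallel_for("ComputeFooCPU", policy, f);
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}
```

On CPU backends a team is effectively a single thread, so the team machinery only added index arithmetic; the flat range does the same work with less bookkeeping, which is presumably why the commit message calls the team parallelism "unnecessary" for the CPU path.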
@@ -204,10 +204,10 @@ class PairSNAPKokkos : public PairSNAP {
void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeighCPU>::member_type& team) const;

KOKKOS_INLINE_FUNCTION
-void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUiCPU>::member_type& team) const;
+void operator() (TagPairSNAPPreUiCPU, const int& iatom) const;

KOKKOS_INLINE_FUNCTION
-void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
+void operator() (TagPairSNAPComputeUiCPU, const int& ii) const;

KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const;
@@ -222,10 +222,10 @@ class PairSNAPKokkos : public PairSNAP {
void operator() (TagPairSNAPComputeYiCPU,const int& ii) const;

KOKKOS_INLINE_FUNCTION
-void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
+void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const;

KOKKOS_INLINE_FUNCTION
-void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
+void operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const;

template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
@@ -204,7 +204,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
const int inum_pad = inum_div * vector_length;
MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta", inum_pad, ncoeff);
snaKK.d_beta = d_beta;
-MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum);
+MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum_pad);
}

chunk_size = MIN(chunksize, inum); // "chunksize" variable is set by user
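The `d_ninside` View is now allocated with the padded count `inum_pad` instead of `inum`, matching `d_beta`. A minimal sketch of the padding arithmetic, assuming `inum_div` is the atom count rounded up to whole `vector_length` blocks (that definition is not shown in this hunk):

```cpp
#include <cstdio>

int main() {
  const int vector_length = 32;   // example value; the real one is a template parameter
  const int inum = 1000;          // example number of local atoms

  // assumed definition of inum_div: round up to whole vector_length blocks
  const int inum_div = (inum + vector_length - 1) / vector_length;
  const int inum_pad = inum_div * vector_length;

  // after this commit both d_beta and d_ninside are sized with inum_pad entries
  std::printf("inum=%d inum_div=%d inum_pad=%d\n", inum, inum_div, inum_pad);
  return 0;
}
```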
@@ -235,17 +235,14 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,

//PreUi
{
-int team_size = team_size_default;
-check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size);
-typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
+typename Kokkos::RangePolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu(0, chunk_size);
Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
}

// ComputeUi
{
-int team_size = team_size_default;
// Fused calculation of ulist and accumulation into ulisttot using atomics
-typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(0, chunk_size*max_neighs);
Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
}

@@ -283,13 +280,10 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,

//ComputeDuidrj and Deidrj
{
-int team_size = team_size_default;

-typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(0,chunk_size*max_neighs);
Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);

-typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+typename Kokkos::RangePolicy<DeviceType,TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(0,chunk_size*max_neighs);
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
}
@@ -601,47 +595,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::coeff(int narg, char
snaKK.init();
}

-/* ----------------------------------------------------------------------
-Begin routines that are common to both the CPU and GPU codepath.
----------------------------------------------------------------------------- */
-
-template<class DeviceType, typename real_type, int vector_length>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBeta, const int& iatom) const {
-
-if (iatom >= chunk_size) return;
-
-const int i = d_ilist[iatom + chunk_offset];
-const int itype = type[i];
-const int ielem = d_map[itype];
-
-auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
-
-for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-d_beta(iatom, icoeff) = d_coeffi[icoeff+1];
-}
-
-if (quadraticflag) {
-const auto idxb_max = snaKK.idxb_max;
-int k = ncoeff+1;
-for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-const auto idxb = icoeff % idxb_max;
-const auto idx_chem = icoeff / idxb_max;
-real_type bveci = snaKK.blist(iatom, idx_chem, idxb);
-d_beta(iatom, icoeff) += d_coeffi[k] * bveci;
-k++;
-for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-const auto jdxb = jcoeff % idxb_max;
-const auto jdx_chem = jcoeff / idxb_max;
-real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb);
-d_beta(iatom, icoeff) += d_coeffi[k] * bvecj;
-d_beta(iatom, jcoeff) += d_coeffi[k] * bveci;
-k++;
-}
-}
-}
-}
-
/* ----------------------------------------------------------------------
Begin routines that are unique to the GPU codepath. These take advantage
of AoSoA data layouts and scratch memory for recursive polynomials
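The `TagPairSNAPBeta` functor removed from this location (it reappears later in the file, in the reorganized "common CPU/GPU" section) builds an effective per-atom beta vector: the linear coefficients are copied directly, and for quadratic SNAP the quadratic coefficients are contracted with the bispectrum components so that the downstream Y/force kernels remain linear in beta. A small standalone sketch of that contraction, using made-up sizes and flat arrays instead of the Kokkos Views (and without the chemical-element index splitting):

```cpp
#include <vector>

// Effective-beta assembly for one atom (illustrative only).
// coeff layout follows the diffed code: coeff[0] is the constant term,
// coeff[1..ncoeff] are the linear terms, and coeff[ncoeff+1..] are the
// packed quadratic terms.
std::vector<double> effective_beta(const std::vector<double>& coeff,
                                   const std::vector<double>& B,
                                   bool quadraticflag) {
  const int ncoeff = static_cast<int>(B.size());
  std::vector<double> beta(ncoeff);

  for (int icoeff = 0; icoeff < ncoeff; icoeff++)
    beta[icoeff] = coeff[icoeff + 1];            // linear part

  if (quadraticflag) {
    int k = ncoeff + 1;                          // first packed quadratic coefficient
    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
      const double bveci = B[icoeff];
      beta[icoeff] += coeff[k] * bveci;          // diagonal term
      k++;
      for (int jcoeff = icoeff + 1; jcoeff < ncoeff; jcoeff++) {
        const double bvecj = B[jcoeff];
        beta[icoeff] += coeff[k] * bvecj;        // off-diagonal term, both directions
        beta[jcoeff] += coeff[k] * bveci;
        k++;
      }
    }
  }
  return beta;
}
```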
@@ -737,222 +690,6 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
});
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(iatom);
|
||||
if (jnbor >= ninside) return;
|
||||
|
||||
snaKK.compute_cayley_klein(iatom,jnbor);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
int itype = type(iatom);
|
||||
int ielem = d_map[itype];
|
||||
|
||||
snaKK.pre_ui(iatom, j, ielem);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
|
||||
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
|
||||
const int jbend = jj_jbend / max_neighs;
|
||||
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
|
||||
int jj = flattened_idx - iatom_div * max_neighs;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
if (idxu > snaKK.idxu_max) return;
|
||||
|
||||
int elem_count = chemflag ? nelements : 1;
|
||||
|
||||
for (int ielem = 0; ielem < elem_count; ielem++) {
|
||||
|
||||
const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
|
||||
|
||||
auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half);
|
||||
auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half);
|
||||
|
||||
if (mapper.flip_sign == 1) {
|
||||
utot_im = -utot_im;
|
||||
} else if (mapper.flip_sign == -1) {
|
||||
utot_re = -utot_re;
|
||||
}
|
||||
|
||||
snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im };
|
||||
|
||||
if (mapper.flip_sign == 0) {
|
||||
snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.;
|
||||
snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (jjz >= snaKK.idxz_max) return;
|
||||
|
||||
snaKK.compute_yi(iatom, jjz);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (jjz >= snaKK.idxz_max) return;
|
||||
|
||||
snaKK.compute_yi_with_zlist(iatom, jjz);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
if (jjz >= snaKK.idxz_max) return;
|
||||
|
||||
snaKK.compute_zi(iatom, jjz);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (jjb >= snaKK.idxb_max) return;
|
||||
|
||||
snaKK.compute_bi(iatom,jjb);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
|
||||
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
|
||||
const int jbend = jj_jbend / max_neighs;
|
||||
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
snaKK.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
|
||||
int jj = flattened_idx - max_neighs * iatom_div;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
snaKK.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Begin routines that are unique to the CPU codepath. These do not take
|
||||
advantage of AoSoA data layouts, but that could be a good point of
|
||||
future optimization and unification with the above kernels. It's unlikely
|
||||
that scratch memory optimizations will ever be useful for the CPU due to
|
||||
different arithmetic intensity requirements for the CPU vs GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeighCPU>::member_type& team) const {
|
||||
@@ -1028,36 +765,156 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
});
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Pre-compute the Cayley-Klein parameters for reuse in later routines.
|
||||
GPU only.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU>::member_type& team) const {
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
|
||||
|
||||
// Extract the atom number
|
||||
const int ii = team.team_rank() + team.team_size() * team.league_rank();
|
||||
if (ii >= chunk_size) return;
|
||||
int itype = type(ii);
|
||||
int ielem = d_map[itype];
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
snaKK.pre_ui_cpu(team,ii,ielem);
|
||||
const int ninside = d_ninside(iatom);
|
||||
if (jnbor >= ninside) return;
|
||||
|
||||
snaKK.compute_cayley_klein(iatom, jnbor);
|
||||
}
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Initialize the "ulisttot" structure with non-zero on-diagonal terms
|
||||
and zero terms elsewhere; both CPU and GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
int itype = type(iatom);
|
||||
int ielem = d_map[itype];
|
||||
|
||||
// Extract the neighbor number
|
||||
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
snaKK.pre_ui(iatom, j, ielem);
|
||||
}
|
||||
|
||||
snaKK.compute_ui_cpu(team,ii,jj);
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU, const int& iatom) const {
|
||||
const int itype = type(iatom);
|
||||
const int ielem = d_map[itype];
|
||||
|
||||
snaKK.pre_ui_cpu(iatom, ielem);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Accumulate the spectral contributions from atom, neighbor pairs into
|
||||
ulisttot_re and _im. These routines are GPU only and use scratch memory
|
||||
staging.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
|
||||
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
|
||||
const int jbend = jj_jbend / max_neighs;
|
||||
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
|
||||
int jj = flattened_idx - iatom_div * max_neighs;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div);
|
||||
});
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Accumulate the spectral contributions from atom, neighbor pairs into
|
||||
ulisttot_re and _im. This routine is CPU only and does not use staging.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiCPU, const int& ii) const {
|
||||
const int iatom = ii / max_neighs;
|
||||
const int jnbor = ii % max_neighs;
|
||||
const int ninside = d_ninside(iatom);
|
||||
if (jnbor >= ninside) return;
|
||||
|
||||
snaKK.compute_ui_cpu(iatom, jnbor);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot
|
||||
structure. Zero-initialize ylist. CPU and GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
if (idxu > snaKK.idxu_max) return;
|
||||
|
||||
int elem_count = chemflag ? nelements : 1;
|
||||
|
||||
for (int ielem = 0; ielem < elem_count; ielem++) {
|
||||
|
||||
const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
|
||||
|
||||
auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half);
|
||||
auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half);
|
||||
|
||||
if (mapper.flip_sign == 1) {
|
||||
utot_im = -utot_im;
|
||||
} else if (mapper.flip_sign == -1) {
|
||||
utot_re = -utot_re;
|
||||
}
|
||||
|
||||
snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im };
|
||||
|
||||
if (mapper.flip_sign == 0) {
|
||||
snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.;
|
||||
snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
@@ -1110,10 +967,21 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Compute all elements of the Z tensor and store them into the `zlist`
|
||||
View. This is only used for energy timesteps or quadratic SNAP.
|
||||
CPU and GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU,const int& ii) const {
|
||||
snaKK.compute_yi_cpu(ii);
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
if (jjz >= snaKK.idxz_max) return;
|
||||
|
||||
snaKK.compute_zi(iatom, jjz);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
@@ -1122,6 +990,22 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
snaKK.compute_zi_cpu(ii);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Compute the energy triple products and store in the "blist" View.
|
||||
CPU and GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
if (jjb >= snaKK.idxb_max) return;
|
||||
|
||||
snaKK.compute_bi(iatom,jjb);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBiCPU, const int& ii) const {
|
||||
@@ -1130,42 +1014,187 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
snaKK.compute_bi(iatom, jjb);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
snaKK.compute_duidrj_cpu(team,ii,jj);
|
||||
}
|
||||
/* ----------------------------------------------------------------------
|
||||
Assemble the "beta" coefficients that enter the computation of the
|
||||
adjoint matrices Y. For quadratic SNAP, this includes accumulating
|
||||
energy triple products into an "effective" beta that encodes the
|
||||
quadratic terms with otherwise linear compute work.
|
||||
CPU and GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBeta, const int& iatom) const {
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
const int i = d_ilist[iatom + chunk_offset];
|
||||
const int itype = type[i];
|
||||
const int ielem = d_map[itype];
|
||||
|
||||
snaKK.compute_deidrj_cpu(team,ii,jj);
|
||||
auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
|
||||
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
d_beta(iatom, icoeff) = d_coeffi[icoeff+1];
|
||||
}
|
||||
|
||||
if (quadraticflag) {
|
||||
const auto idxb_max = snaKK.idxb_max;
|
||||
int k = ncoeff+1;
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
real_type bveci = snaKK.blist(iatom, idx_chem, idxb);
|
||||
d_beta(iatom, icoeff) += d_coeffi[k] * bveci;
|
||||
k++;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
const auto jdxb = jcoeff % idxb_max;
|
||||
const auto jdx_chem = jcoeff / idxb_max;
|
||||
real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb);
|
||||
d_beta(iatom, icoeff) += d_coeffi[k] * bvecj;
|
||||
d_beta(iatom, jcoeff) += d_coeffi[k] * bveci;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Also used for both CPU and GPU codepaths. Could maybe benefit from a
|
||||
separate GPU/CPU codepath, but this kernel takes so little time it's
|
||||
likely not worth it.
|
||||
Compute all elements of the Z tensor and accumulate them into the
|
||||
adjoint matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
if (jjz >= snaKK.idxz_max) return;
|
||||
|
||||
snaKK.compute_yi(iatom, jjz);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU, const int& ii) const {
|
||||
snaKK.compute_yi_cpu(ii);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Accumulate the pre-computed elements of the Z tensor into the adjoint
|
||||
matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
if (jjz >= snaKK.idxz_max) return;
|
||||
|
||||
snaKK.compute_yi_with_zlist(iatom, jjz);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Assemble the force contributions for each atom, neighbor pair by
|
||||
contracting the adjoint matrices Y with derivatives of the Wigner
|
||||
matrices U. These routines are GPU only and use scratch memory
|
||||
staging.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
|
||||
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
|
||||
const int jbend = jj_jbend / max_neighs;
|
||||
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
snaKK.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
|
||||
int jj = flattened_idx - max_neighs * iatom_div;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
snaKK.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Assemble the derivatives of the Wigner matrices U into the View
|
||||
"dulist". CPU only.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const {
|
||||
const int iatom = ii / max_neighs;
|
||||
const int jnbor = ii % max_neighs;
|
||||
const int ninside = d_ninside(iatom);
|
||||
if (jnbor >= ninside) return;
|
||||
|
||||
snaKK.compute_duidrj_cpu(iatom, jnbor);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Assemble the force contributions for each atom, neighbor pair by
|
||||
contracting the adjoint matrices Y with the pre-computed derivatives
|
||||
of the Wigner matrices U. CPU only.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const {
|
||||
const int iatom = ii / max_neighs;
|
||||
const int jnbor = ii % max_neighs;
|
||||
const int ninside = d_ninside(iatom);
|
||||
if (jnbor >= ninside) return;
|
||||
|
||||
snaKK.compute_deidrj_cpu(iatom, jnbor);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
This routine formally accumulates the "chunked" force contributions
|
||||
into the broader LAMMPS "f" force View. As appropriate it
|
||||
also accumulates the total energy and the virial. CPU and GPU.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
|
||||
@@ -237,9 +237,9 @@ class SNAKokkos {

// functions for bispectrum coefficients, CPU only
KOKKOS_INLINE_FUNCTION
-void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&) const; // ForceSNAP
+void pre_ui_cpu(const int&, const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION
-void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; // ForceSNAP
+void compute_ui_cpu(const int&, const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_zi_cpu(const int&) const; // ForceSNAP
KOKKOS_INLINE_FUNCTION

@@ -247,9 +247,9 @@ class SNAKokkos {

// functions for derivatives, CPU only
KOKKOS_INLINE_FUNCTION
-void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; //ForceSNAP
+void compute_duidrj_cpu(const int&, const int&) const; //ForceSNAP
KOKKOS_INLINE_FUNCTION
-void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int) const; // ForceSNAP
+void compute_deidrj_cpu(const int&, const int&) const; // ForceSNAP

KOKKOS_INLINE_FUNCTION
real_type compute_sfac(real_type, real_type, real_type, real_type) const; // add_uarraytot, compute_duarray

@@ -358,11 +358,10 @@ class SNAKokkos {
void init_rootpqarray(); // init()

KOKKOS_INLINE_FUNCTION
-void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui
+void add_uarraytot(int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui

KOKKOS_INLINE_FUNCTION
-void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-const real_type&, const real_type&, const real_type&,
+void compute_uarray_cpu(int, int, const real_type&, const real_type&, const real_type&,
const real_type&, const real_type&) const; // compute_ui_cpu

@@ -372,8 +371,7 @@ class SNAKokkos {
inline
int compute_ncoeff(); // SNAKokkos()
KOKKOS_INLINE_FUNCTION
-void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
+void compute_duarray_cpu(int, int, const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
const real_type&, const real_type&, const real_type&, const real_type&, const real_type&,
const real_type&, const real_type&) const;
@@ -1156,17 +1156,14 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::real_type SNAKokkos<De
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem) const
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const int& iatom, const int& ielem) const
|
||||
{
|
||||
for (int jelem = 0; jelem < nelements; jelem++) {
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug
|
||||
|
||||
// Only diagonal elements get initialized
|
||||
// for (int m = 0; m < (j+1)*(j/2+1); m++)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
|
||||
[&] (const int m) {
|
||||
|
||||
for (int m = 0; m < (j+1)*(j/2+1); m++) {
|
||||
const int jjup = jju + m;
|
||||
|
||||
// if m is on the "diagonal", initialize it with the self energy.
|
||||
@@ -1176,7 +1173,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename
|
||||
|
||||
ulisttot_re(iatom, jelem, jjup) = init;
|
||||
ulisttot_im(iatom, jelem, jjup) = 0;
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1191,7 +1188,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const int& iatom, const int& jnbor) const
|
||||
{
|
||||
real_type rsq, r, x, y, z, z0, theta0;
|
||||
|
||||
@@ -1211,8 +1208,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const typen
|
||||
// theta0 = (r - rmin0) * rscale0;
|
||||
z0 = r / tan(theta0);
|
||||
|
||||
compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r);
|
||||
add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor));
|
||||
compute_uarray_cpu(iatom, jnbor, x, y, z, z0, r);
|
||||
add_uarraytot(iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor));
|
||||
|
||||
}
|
||||
/* ----------------------------------------------------------------------
|
||||
@@ -1328,7 +1325,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_cpu(int iter) c
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const int& iatom, const int& jnbor) const
|
||||
{
|
||||
real_type rsq, r, x, y, z, z0, theta0, cs, sn;
|
||||
real_type dz0dr;
|
||||
@@ -1345,7 +1342,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const t
|
||||
z0 = r * cs / sn;
|
||||
dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
|
||||
|
||||
compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor));
|
||||
compute_duarray_cpu(iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor));
|
||||
}
|
||||
|
||||
|
||||
@@ -1360,54 +1357,42 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const t
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) const
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const int& iatom, const int& jnbor) const
|
||||
{
|
||||
t_scalar3<real_type> final_sum;
|
||||
real_type force_sum[3] = { 0, 0, 0 };
|
||||
const int jelem = element(iatom, jnbor);
|
||||
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
|
||||
[&] (const int& j, t_scalar3<real_type>& sum_tmp) {
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
int jju_half = idxu_half_block[j];
|
||||
int jju_cache = idxu_cache_block[j];
|
||||
|
||||
for (int mb = 0; 2*mb < j; mb++)
|
||||
for (int mb = 0; 2 * mb < j; mb++) {
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
|
||||
sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
|
||||
sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
|
||||
sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
|
||||
complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
|
||||
for (int k = 0; k < 3; k++)
|
||||
force_sum[k] += dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re +
|
||||
dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im;
|
||||
jju_half++; jju_cache++;
|
||||
} //end loop over ma mb
|
||||
}
|
||||
} //end loop over ma mb
|
||||
|
||||
// For j even, handle middle column
|
||||
|
||||
if (j%2 == 0) {
|
||||
|
||||
int mb = j/2;
|
||||
for (int ma = 0; ma < mb; ma++) {
|
||||
const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
|
||||
sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
|
||||
sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
|
||||
sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
|
||||
if (j % 2 == 0) {
|
||||
//int mb = j / 2;
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) };
|
||||
for (int k = 0; k < 3; k++)
|
||||
force_sum[k] += static_cast<real_type>(0.5) * (dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re +
|
||||
dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im);
|
||||
jju_half++; jju_cache++;
|
||||
}
|
||||
|
||||
//int ma = mb;
|
||||
// 0.5 is meant to avoid double-counting
|
||||
const complex y_val = { 0.5 * ylist_re(iatom, jelem, jju_half), 0.5 * ylist_im(iatom, jelem, jju_half) };
|
||||
sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im;
|
||||
sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im;
|
||||
sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im;
|
||||
} // end if jeven
|
||||
|
||||
},final_sum); // end loop over j
|
||||
|
||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||
dedr(iatom,jnbor,0) = final_sum.x*2.0;
|
||||
dedr(iatom,jnbor,1) = final_sum.y*2.0;
|
||||
dedr(iatom,jnbor,2) = final_sum.z*2.0;
|
||||
});
|
||||
}
|
||||
|
||||
for (int k = 0; k < 3; k++)
|
||||
dedr(iatom, jnbor, k) = 2 * force_sum[k];
|
||||
}
|
||||
|
||||
|
||||
@@ -1421,15 +1406,13 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const t
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(int iatom, int jnbor,
|
||||
const real_type& r, const real_type& wj, const real_type& rcut,
|
||||
const real_type& sinner, const real_type& dinner, int jelem) const
|
||||
{
|
||||
const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
|
||||
[&] (const int& j) {
|
||||
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
int jju_half = idxu_half_block[j]; // index into ulisttot
|
||||
int jju_cache = idxu_cache_block[j]; // index into ulist
|
||||
|
||||
@@ -1441,7 +1424,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typena
|
||||
count++;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@@ -1452,23 +1435,15 @@ void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typena
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(int iatom, int jnbor,
|
||||
const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const
|
||||
{
|
||||
real_type r0inv;
|
||||
real_type a_r, b_r, a_i, b_i;
|
||||
real_type rootpq;
|
||||
|
||||
// compute Cayley-Klein parameters for unit quaternion
|
||||
|
||||
r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
|
||||
a_r = r0inv * z0;
|
||||
a_i = -r0inv * z;
|
||||
b_r = r0inv * y;
|
||||
b_i = -r0inv * x;
|
||||
real_type r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
|
||||
complex a = { r0inv * z0, -r0inv * z };
|
||||
complex b = { r0inv * y, -r0inv * x };
|
||||
|
||||
// VMK Section 4.8.2
|
||||
|
||||
ulist_cpu(iatom, jnbor, 0) = complex::one();
|
||||
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
@@ -1476,61 +1451,65 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const t
|
||||
int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug
|
||||
|
||||
// fill in left side of matrix layer from previous layer
|
||||
for (int mb = 0; 2*mb <= j; mb++) {
|
||||
int jju_index = jju + (j + 1) * mb;
|
||||
int jjup_index = jjup + j * mb;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
|
||||
[&] (const int& mb) {
|
||||
//for (int mb = 0; 2*mb <= j; mb++) {
|
||||
const int jju_index = jju+mb+mb*j;
|
||||
ulist_cpu(iatom, jnbor, jju_index) = { static_cast<real_type>(0), static_cast<real_type>(0) };
|
||||
complex ui = complex::zero();
|
||||
|
||||
for (int ma = 0; ma < j; ma++) {
|
||||
const int jju_index = jju+mb+mb*j+ma;
|
||||
const int jjup_index = jjup+mb*j+ma;
|
||||
rootpq = rootpqarray(j - ma,j - mb);
|
||||
ulist_cpu(iatom, jnbor, jju_index).re +=
|
||||
rootpq *
|
||||
(a_r * ulist_cpu(iatom, jnbor, jjup_index).re +
|
||||
a_i * ulist_cpu(iatom, jnbor, jjup_index).im);
|
||||
ulist_cpu(iatom, jnbor, jju_index).im +=
|
||||
rootpq *
|
||||
(a_r * ulist_cpu(iatom, jnbor, jjup_index).im -
|
||||
a_i * ulist_cpu(iatom, jnbor, jjup_index).re);
|
||||
complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
|
||||
|
||||
rootpq = rootpqarray(ma + 1,j - mb);
|
||||
ulist_cpu(iatom, jnbor, jju_index+1).re =
|
||||
-rootpq *
|
||||
(b_r * ulist_cpu(iatom, jnbor, jjup_index).re +
|
||||
b_i * ulist_cpu(iatom, jnbor, jjup_index).im);
|
||||
ulist_cpu(iatom, jnbor, jju_index+1).im =
|
||||
-rootpq *
|
||||
(b_r * ulist_cpu(iatom, jnbor, jjup_index).im -
|
||||
b_i * ulist_cpu(iatom, jnbor, jjup_index).re);
|
||||
real_type rootpq = rootpqarray(j - ma, j - mb);
|
||||
ui.re += rootpq * (a.re * ui_prev.re + a.im * ui_prev.im);
|
||||
ui.im += rootpq * (a.re * ui_prev.im - a.im * ui_prev.re);
|
||||
|
||||
ulist_cpu(iatom, jnbor, jju_index) = ui;
|
||||
|
||||
rootpq = rootpqarray(ma + 1, j - mb);
|
||||
ui.re = -rootpq * (b.re * ui_prev.re + b.im * ui_prev.im);
|
||||
ui.im = -rootpq * (b.re * ui_prev.im - b.im * ui_prev.re);
|
||||
|
||||
jju_index++;
|
||||
jjup_index++;
|
||||
}
|
||||
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
|
||||
ulist_cpu(iatom, jnbor, jju_index) = ui;
|
||||
}
|
||||
|
||||
// Only need to add one symmetrized row for convenience
|
||||
// Symmetry gets "unfolded" in accumulating ulisttot
|
||||
if (j%2==1 && mb==(j/2)) {
|
||||
const int mbpar = (mb)%2==0?1:-1;
|
||||
int mapar = mbpar;
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
const int jju_index = jju + mb*(j+1) + ma;
|
||||
const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1);
|
||||
if (mapar == 1) {
|
||||
ulist_cpu(iatom, jnbor, jjup_index).re = ulist_cpu(iatom, jnbor, jju_index).re;
|
||||
ulist_cpu(iatom, jnbor, jjup_index).im = -ulist_cpu(iatom, jnbor, jju_index).im;
|
||||
} else {
|
||||
ulist_cpu(iatom, jnbor, jjup_index).re = -ulist_cpu(iatom, jnbor, jju_index).re;
|
||||
ulist_cpu(iatom, jnbor, jjup_index).im = ulist_cpu(iatom, jnbor, jju_index).im;
|
||||
}
|
||||
mapar = -mapar;
|
||||
}
|
||||
// If j is odd (half-integer in the mathematical convention), we need
|
||||
// to add one more row for convenience (for now). This can either be done
|
||||
// via symmetry (see the commented code below), or by the equations to fill
|
||||
// from the left instead of the right
|
||||
if (j % 2 == 1) {
|
||||
int mb = j / 2;
|
||||
// begin filling in the extra row
|
||||
int jju_index = jju + (mb + 1) * (j + 1);
|
||||
int jjup_index = jjup + mb * j;
|
||||
|
||||
complex ui = complex::zero();
|
||||
|
||||
for (int ma = 0; ma < j; ma++) {
|
||||
complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
|
||||
|
||||
real_type rootpq = rootpqarray(j - ma, mb + 1);
|
||||
ui.re += rootpq * (b.re * ui_prev.re - b.im * ui_prev.im);
|
||||
ui.im += rootpq * (b.re * ui_prev.im + b.im * ui_prev.re);
|
||||
|
||||
ulist_cpu(iatom, jnbor, jju_index) = ui;
|
||||
|
||||
rootpq = rootpqarray(ma + 1, mb + 1);
|
||||
ui.re = rootpq * (a.re * ui_prev.re - a.im * ui_prev.im);
|
||||
ui.im = rootpq * (a.re * ui_prev.im + a.im * ui_prev.re);
|
||||
|
||||
jju_index++;
|
||||
jjup_index++;
|
||||
}
|
||||
});
|
||||
|
||||
ulist_cpu(iatom, jnbor, jju_index) = ui;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@@ -1541,30 +1520,25 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const t
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
|
||||
void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(int iatom, int jnbor,
|
||||
const real_type& x, const real_type& y, const real_type& z,
|
||||
const real_type& z0, const real_type& r, const real_type& dz0dr,
|
||||
const real_type& wj, const real_type& rcut,
|
||||
const real_type& sinner, const real_type& dinner) const
|
||||
{
|
||||
real_type r0inv;
|
||||
real_type a_r, a_i, b_r, b_i;
|
||||
real_type u[3], da_r[3], da_i[3], db_r[3], db_i[3];
|
||||
real_type dz0[3], dr0inv[3], dr0invdr;
|
||||
real_type rootpq;
|
||||
complex da[3], db[3];
|
||||
real_type u[3], dz0[3], dr0inv[3];
|
||||
|
||||
real_type rinv = 1.0 / r;
|
||||
u[0] = x * rinv;
|
||||
u[1] = y * rinv;
|
||||
u[2] = z * rinv;
|
||||
|
||||
r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
||||
a_r = z0 * r0inv;
|
||||
a_i = -z * r0inv;
|
||||
b_r = y * r0inv;
|
||||
b_i = -x * r0inv;
|
||||
real_type r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
||||
complex a = { z0 * r0inv, -z * r0inv };
|
||||
complex b = { y * r0inv, -x * r0inv };
|
||||
|
||||
dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
|
||||
real_type dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
|
||||
|
||||
dr0inv[0] = dr0invdr * u[0];
|
||||
dr0inv[1] = dr0invdr * u[1];
|
||||
@@ -1575,19 +1549,19 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
|
||||
dz0[2] = dz0dr * u[2];
|
||||
|
||||
for (int k = 0; k < 3; k++) {
|
||||
da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k];
|
||||
da_i[k] = -z * dr0inv[k];
|
||||
da[k].re = dz0[k] * r0inv + z0 * dr0inv[k];
|
||||
da[k].im = -z * dr0inv[k];
|
||||
}
|
||||
|
||||
da_i[2] += -r0inv;
|
||||
da[2].im += -r0inv;
|
||||
|
||||
for (int k = 0; k < 3; k++) {
|
||||
db_r[k] = y * dr0inv[k];
|
||||
db_i[k] = -x * dr0inv[k];
|
||||
db[k].re = y * dr0inv[k];
|
||||
db[k].im = -x * dr0inv[k];
|
||||
}
|
||||
|
||||
db_i[0] += -r0inv;
|
||||
db_r[1] += r0inv;
|
||||
db[0].im += -r0inv;
|
||||
db[1].re += r0inv;
|
||||
|
||||
for (int k = 0; k < 3; k++)
|
||||
dulist_cpu(iatom, jnbor, 0, k) = complex::zero();
|
||||
@ -1595,72 +1569,94 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
int jju = idxu_cache_block[j];
|
||||
int jjup = idxu_cache_block[j-1];
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
|
||||
[&] (const int& mb) {
|
||||
//for (int mb = 0; 2*mb <= j; mb++) {
|
||||
const int jju_index = jju+mb+mb*j;
|
||||
for (int k = 0; k < 3; k++)
|
||||
dulist_cpu(iatom, jnbor, jju_index, k) = complex::zero();
|
||||
|
||||
for (int mb = 0; 2*mb <= j; mb++) {
|
||||
int jju_index = jju + mb * (j + 1);
|
||||
int jjup_index = jjup + mb * j;
|
||||
|
||||
complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() };
|
||||
|
||||
for (int ma = 0; ma < j; ma++) {
|
||||
const int jju_index = jju+mb+mb*j+ma;
|
||||
const int jjup_index = jjup+mb*j+ma;
|
||||
rootpq = rootpqarray(j - ma,j - mb);
|
||||
complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
|
||||
complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0),
|
||||
dulist_cpu(iatom, jnbor, jjup_index, 1),
|
||||
dulist_cpu(iatom, jnbor, jjup_index, 2) };
|
||||
|
||||
real_type rootpq = rootpqarray(j - ma,j - mb);
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist_cpu(iatom, jnbor, jju_index, k).re +=
|
||||
rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
|
||||
da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im +
|
||||
a_r * dulist_cpu(iatom, jnbor, jjup_index, k).re +
|
||||
a_i * dulist_cpu(iatom, jnbor, jjup_index, k).im);
|
||||
dulist_cpu(iatom, jnbor, jju_index, k).im +=
|
||||
rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im -
|
||||
da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
|
||||
a_r * dulist_cpu(iatom, jnbor, jjup_index, k).im -
|
||||
a_i * dulist_cpu(iatom, jnbor, jjup_index, k).re);
|
||||
duidrj[k].re += rootpq * (da[k].re * ui_prev.re + da[k].im * ui_prev.im +
|
||||
a.re * duidrj_prev[k].re + a.im * duidrj_prev[k].im);
|
||||
|
||||
duidrj[k].im += rootpq * (da[k].re * ui_prev.im - da[k].im * ui_prev.re +
|
||||
a.re * duidrj_prev[k].im - a.im * duidrj_prev[k].re);
|
||||
|
||||
dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
|
||||
}
|
||||
|
||||
rootpq = rootpqarray(ma + 1,j - mb);
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist_cpu(iatom, jnbor, jju_index+1, k).re =
|
||||
-rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
|
||||
db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im +
|
||||
b_r * dulist_cpu(iatom, jnbor, jjup_index, k).re +
|
||||
b_i * dulist_cpu(iatom, jnbor, jjup_index, k).im);
|
||||
dulist_cpu(iatom, jnbor, jju_index+1, k).im =
|
||||
-rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im -
|
||||
db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re +
|
||||
b_r * dulist_cpu(iatom, jnbor, jjup_index, k).im -
|
||||
b_i * dulist_cpu(iatom, jnbor, jjup_index, k).re);
|
||||
duidrj[k].re = -rootpq * (db[k].re * ui_prev.re + db[k].im * ui_prev.im +
|
||||
b.re * duidrj_prev[k].re + b.im * duidrj_prev[k].im);
|
||||
|
||||
duidrj[k].im = -rootpq * (db[k].re * ui_prev.im - db[k].im * ui_prev.re +
|
||||
b.re * duidrj_prev[k].im - b.im * duidrj_prev[k].re);
|
||||
}
|
||||
|
||||
jju_index++;
|
||||
jjup_index++;
|
||||
}
|
||||
|
||||
// Only need to add one symmetrized row for convenience
|
||||
// Symmetry gets "unfolded" during the dedr accumulation
|
||||
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
|
||||
|
||||
if (j%2==1 && mb==(j/2)) {
|
||||
const int mbpar = (mb)%2==0?1:-1;
|
||||
int mapar = mbpar;
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
const int jju_index = jju+mb*(j+1)+ma;
|
||||
const int jjup_index = jju+(mb+2)*(j+1)-(ma+1);
|
||||
if (mapar == 1) {
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist_cpu(iatom, jnbor, jjup_index, k).re = dulist_cpu(iatom, jnbor, jju_index, k).re;
|
||||
dulist_cpu(iatom, jnbor, jjup_index, k).im = -dulist_cpu(iatom, jnbor, jju_index, k).im;
|
||||
}
|
||||
} else {
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist_cpu(iatom, jnbor, jjup_index, k).re = -dulist_cpu(iatom, jnbor, jju_index, k).re;
|
||||
dulist_cpu(iatom, jnbor, jjup_index, k).im = dulist_cpu(iatom, jnbor, jju_index, k).im;
|
||||
}
|
||||
}
|
||||
mapar = -mapar;
|
||||
}
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Only need to add one symmetrized row for convenience
|
||||
// Symmetry gets "unfolded" during the dedr accumulation
|
||||
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
|
||||
|
||||
if (j % 2 == 1) {
|
||||
int mb = j / 2;
|
||||
// begin filling in the extra row
|
||||
int jju_index = jju + (mb + 1) * (j + 1);
|
||||
int jjup_index = jjup + mb * j;
|
||||
|
||||
complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() };
|
||||
|
||||
for (int ma = 0; ma < j; ma++) {
|
||||
complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index);
|
||||
complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0),
|
||||
dulist_cpu(iatom, jnbor, jjup_index, 1),
|
||||
dulist_cpu(iatom, jnbor, jjup_index, 2) };
|
||||
|
||||
real_type rootpq = rootpqarray(j - ma, mb + 1);
|
||||
for (int k = 0; k < 3; k++) {
|
||||
duidrj[k].re += rootpq * (db[k].re * ui_prev.re - db[k].im * ui_prev.im +
|
||||
b.re * duidrj_prev[k].re - b.im * duidrj_prev[k].im);
|
||||
duidrj[k].im += rootpq * (db[k].re * ui_prev.im + db[k].im * ui_prev.re +
|
||||
b.re * duidrj_prev[k].im + b.im * duidrj_prev[k].re);
|
||||
|
||||
dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
|
||||
}
|
||||
|
||||
rootpq = rootpqarray(ma + 1, mb + 1);
|
||||
for (int k = 0; k < 3; k++) {
|
||||
duidrj[k].re = rootpq * (da[k].re * ui_prev.re - da[k].im * ui_prev.im +
|
||||
a.re * duidrj_prev[k].re - a.im * duidrj_prev[k].im);
|
||||
duidrj[k].im = rootpq * (da[k].re * ui_prev.im + da[k].im * ui_prev.re +
|
||||
a.re * duidrj_prev[k].im + a.im * duidrj_prev[k].re);
|
||||
}
|
||||
|
||||
jju_index++;
|
||||
jjup_index++;
|
||||
}
|
||||
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
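A schematic note on the fused updates above: ui_prev and duidrj_prev enter every accumulation because each term of the u recursion is differentiated by the product rule. With c the constant rootpqarray factor and alpha standing for the Cayley-Klein factor of that branch (a or b, possibly conjugated), the pattern expanded into real and imaginary parts in the code is

  \[ \frac{\partial}{\partial r_k}\bigl(c\,\alpha\,u_{\mathrm{prev}}\bigr) \;=\; c\left(\frac{\partial \alpha}{\partial r_k}\,u_{\mathrm{prev}} \;+\; \alpha\,\frac{\partial u_{\mathrm{prev}}}{\partial r_k}\right). \]
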
  real_type sfac = compute_sfac(r, rcut, sinner, dinner);
@ -1678,9 +1674,9 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const
    for (int ma = 0; ma <= j; ma++) {
      for (int k = 0; k < 3; k++) {
        dulist_cpu(iatom, jnbor, jju, k).re = dsfac * ulist_cpu(iatom, jnbor, jju).re * u[k] +
                                              sfac * dulist_cpu(iatom, jnbor, jju, k).re;
        dulist_cpu(iatom, jnbor, jju, k).im = dsfac * ulist_cpu(iatom, jnbor, jju).im * u[k] +
                                              sfac * dulist_cpu(iatom, jnbor, jju, k).im;
      }
      jju++;
    }

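The closing loop folds the radial switching function into the stored derivatives: with s = sfac, dsfac its derivative with respect to r, and u[k] = r_k / r, the dsfac * ulist * u[k] + sfac * dulist expression above is the chain rule

  \[ \frac{\partial}{\partial r_k}\bigl(s(r)\,u\bigr) \;=\; \frac{ds}{dr}\,\frac{r_k}{r}\,u \;+\; s(r)\,\frac{\partial u}{\partial r_k}. \]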