diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 7d58f387d6..e7797097c2 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -204,10 +204,10 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPPreUiCPU, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeUiCPU, const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const; @@ -222,10 +222,10 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeYiCPU,const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const; template KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index b2252f85c7..d0647337a8 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -204,7 +204,7 @@ void PairSNAPKokkos::compute(int eflag_in, const int inum_pad = inum_div * vector_length; MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta", inum_pad, ncoeff); snaKK.d_beta = d_beta; - MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum); + MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum_pad); } chunk_size = MIN(chunksize, inum); // "chunksize" variable is set by user @@ -235,17 +235,14 @@ void PairSNAPKokkos::compute(int eflag_in, //PreUi { - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); + typename Kokkos::RangePolicy policy_preui_cpu(0, chunk_size); Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); } // ComputeUi { - int team_size = team_size_default; // Fused calculation of ulist and accumulation into ulisttot using atomics - typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + typename Kokkos::RangePolicy policy_ui_cpu(0, chunk_size*max_neighs); Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); } @@ -283,13 +280,10 @@ void PairSNAPKokkos::compute(int eflag_in, //ComputeDuidrj and Deidrj { - int team_size = team_size_default; - - typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + typename Kokkos::RangePolicy policy_duidrj_cpu(0,chunk_size*max_neighs); Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - + typename Kokkos::RangePolicy policy_deidrj_cpu(0,chunk_size*max_neighs); Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); } @@ -601,47 +595,6 @@ void PairSNAPKokkos::coeff(int narg, char snaKK.init(); } -/* ---------------------------------------------------------------------- - Begin routines that are common to both the CPU and GPU codepath. -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBeta, const int& iatom) const { - - if (iatom >= chunk_size) return; - - const int i = d_ilist[iatom + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - - auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; - } - - if (quadraticflag) { - const auto idxb_max = snaKK.idxb_max; - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - const auto idxb = icoeff % idxb_max; - const auto idx_chem = icoeff / idxb_max; - real_type bveci = snaKK.blist(iatom, idx_chem, idxb); - d_beta(iatom, icoeff) += d_coeffi[k] * bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - const auto jdxb = jcoeff % idxb_max; - const auto jdx_chem = jcoeff / idxb_max; - real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb); - d_beta(iatom, icoeff) += d_coeffi[k] * bvecj; - d_beta(iatom, jcoeff) += d_coeffi[k] * bveci; - k++; - } - } - } -} - /* ---------------------------------------------------------------------- Begin routines that are unique to the GPU codepath. These take advantage of AoSoA data layouts and scratch memory for recursive polynomials @@ -737,222 +690,6 @@ void PairSNAPKokkos::operator() (TagPairSN }); } -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - const int ninside = d_ninside(iatom); - if (jnbor >= ninside) return; - - snaKK.compute_cayley_klein(iatom,jnbor); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const { - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - int itype = type(iatom); - int ielem = d_map[itype]; - - snaKK.pre_ui(iatom, j, ielem); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug - const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); - const int jbend = jj_jbend / max_neighs; - int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); - }); - -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug - int jj = flattened_idx - iatom_div * max_neighs; - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); - }); - -} - - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - if (idxu > snaKK.idxu_max) return; - - int elem_count = chemflag ? nelements : 1; - - for (int ielem = 0; ielem < elem_count; ielem++) { - - const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - - auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half); - auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half); - - if (mapper.flip_sign == 1) { - utot_im = -utot_im; - } else if (mapper.flip_sign == -1) { - utot_re = -utot_re; - } - - snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im }; - - if (mapper.flip_sign == 0) { - snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.; - snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.; - } - } -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjz >= snaKK.idxz_max) return; - - snaKK.compute_yi(iatom, jjz); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjz >= snaKK.idxz_max) return; - - snaKK.compute_yi_with_zlist(iatom, jjz); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - if (jjz >= snaKK.idxz_max) return; - - snaKK.compute_zi(iatom, jjz); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjb >= snaKK.idxb_max) return; - - snaKK.compute_bi(iatom,jjb); -} - -template -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicy >::member_type& team) const { - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug - const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); - const int jbend = jj_jbend / max_neighs; - int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - snaKK.template compute_fused_deidrj_small(team, iatom_mod, jbend, jj, iatom_div); - - }); - -} - -template -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicy >::member_type& team) const { - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug - int jj = flattened_idx - max_neighs * iatom_div; - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - snaKK.template compute_fused_deidrj_large(team, iatom_mod, jj, iatom_div); - - }); -} - -/* ---------------------------------------------------------------------- - Begin routines that are unique to the CPU codepath. These do not take - advantage of AoSoA data layouts, but that could be a good point of - future optimization and unification with the above kernels. It's unlikely - that scratch memory optimizations will ever be useful for the CPU due to - different arithmetic intensity requirements for the CPU vs GPU. -------------------------------------------------------------------------- */ - template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -1028,36 +765,156 @@ void PairSNAPKokkos::operator() (TagPairSN }); } +/* ---------------------------------------------------------------------- + Pre-compute the Cayley-Klein parameters for reuse in later routines. + GPU only. +------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - // Extract the atom number - const int ii = team.team_rank() + team.team_size() * team.league_rank(); - if (ii >= chunk_size) return; - int itype = type(ii); - int ielem = d_map[itype]; + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; - snaKK.pre_ui_cpu(team,ii,ielem); + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_cayley_klein(iatom, jnbor); } - +/* ---------------------------------------------------------------------- + Initialize the "ulisttot" structure with non-zero on-diagonal terms + and zero terms elsewhere; both CPU and GPU. +------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; + int itype = type(iatom); + int ielem = d_map[itype]; - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; + snaKK.pre_ui(iatom, j, ielem); +} - snaKK.compute_ui_cpu(team,ii,jj); +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU, const int& iatom) const { + const int itype = type(iatom); + const int ielem = d_map[itype]; + + snaKK.pre_ui_cpu(iatom, ielem); +} + +/* ---------------------------------------------------------------------- + Accumulate the spectral contributions from atom, neighbor pairs into + ulisttot_re and _im. These routines are GPU only and use scratch memory + staging. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - iatom_div * max_neighs; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); + }); +} + +/* ---------------------------------------------------------------------- + Accumulate the spectral contributions from atom, neighbor pairs into + ulisttot_re and _im. This routine is CPU only and does not use staging. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU, const int& ii) const { + const int iatom = ii / max_neighs; + const int jnbor = ii % max_neighs; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_ui_cpu(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot + structure. Zero-initialize ylist. CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int iatom_mod, const int idxu, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (idxu > snaKK.idxu_max) return; + + int elem_count = chemflag ? nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++) { + + const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; + + auto utot_re = snaKK.ulisttot_re(iatom, ielem, mapper.idxu_half); + auto utot_im = snaKK.ulisttot_im(iatom, ielem, mapper.idxu_half); + + if (mapper.flip_sign == 1) { + utot_im = -utot_im; + } else if (mapper.flip_sign == -1) { + utot_re = -utot_re; + } + + snaKK.ulisttot(iatom, ielem, idxu) = { utot_re, utot_im }; + + if (mapper.flip_sign == 0) { + snaKK.ylist_re(iatom, ielem, mapper.idxu_half) = 0.; + snaKK.ylist_im(iatom, ielem, mapper.idxu_half) = 0.; + } + } } template @@ -1110,10 +967,21 @@ void PairSNAPKokkos::operator() (TagPairSN } } +/* ---------------------------------------------------------------------- + Compute all elements of the Z tensor and store them into the `zlist` + View. This is only used for energy timesteps or quadratic SNAP. + CPU and GPU. +------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { - snaKK.compute_yi_cpu(ii); +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + + snaKK.compute_zi(iatom, jjz); } template @@ -1122,6 +990,22 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.compute_zi_cpu(ii); } +/* ---------------------------------------------------------------------- + Compute the energy triple products and store in the "blist" View. + CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjb >= snaKK.idxb_max) return; + + snaKK.compute_bi(iatom,jjb); +} + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU, const int& ii) const { @@ -1130,42 +1014,187 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.compute_bi(iatom, jjb); } -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - snaKK.compute_duidrj_cpu(team,ii,jj); -} +/* ---------------------------------------------------------------------- + Assemble the "beta" coefficients that enter the computation of the + adjoint matrices Y. For quadratic SNAP, this includes accumulating + energy triple products into an "effective" beta that encodes the + quadratic terms with otherwise linear compute work. + CPU and GPU. +------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPBeta, const int& iatom) const { - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; + if (iatom >= chunk_size) return; - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; - snaKK.compute_deidrj_cpu(team,ii,jj); + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; + } + + if (quadraticflag) { + const auto idxb_max = snaKK.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + real_type bveci = snaKK.blist(iatom, idx_chem, idxb); + d_beta(iatom, icoeff) += d_coeffi[k] * bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + real_type bvecj = snaKK.blist(iatom, jdx_chem, jdxb); + d_beta(iatom, icoeff) += d_coeffi[k] * bvecj; + d_beta(iatom, jcoeff) += d_coeffi[k] * bveci; + k++; + } + } + } } /* ---------------------------------------------------------------------- - Also used for both CPU and GPU codepaths. Could maybe benefit from a - separate GPU/CPU codepath, but this kernel takes so little time it's - likely not worth it. + Compute all elements of the Z tensor and accumultate them into the + adjoint matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + + snaKK.compute_yi(iatom, jjz); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU, const int& ii) const { + snaKK.compute_yi_cpu(ii); +} + +/* ---------------------------------------------------------------------- + Accumulate the pre-computed elements of the Z tensor into the adjoint + matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const { + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + + snaKK.compute_yi_with_zlist(iatom, jjz); +} + +/* ---------------------------------------------------------------------- + Assemble the force contributions for each atom, neighbor pair by + contracting the adjoint matrices Y with derivatives of the Wigner + matrices U. These routines are GPU only and use scratch memory + staging. +------------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicy >::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.template compute_fused_deidrj_small(team, iatom_mod, jbend, jj, iatom_div); + + }); + +} + +template +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicy >::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - max_neighs * iatom_div; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.template compute_fused_deidrj_large(team, iatom_mod, jj, iatom_div); + + }); +} + +/* ---------------------------------------------------------------------- + Assemble the derivatives of the Winger matrices U into the View + "dulist". CPU only. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU, const int& ii) const { + const int iatom = ii / max_neighs; + const int jnbor = ii % max_neighs; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_duidrj_cpu(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + Assemble the force contributions for each atom, neighbor pair by + contracting the adjoint matrices Y with the pre-computed derivatives + of the Wigner matrices U. CPU only. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU, const int& ii) const { + const int iatom = ii / max_neighs; + const int jnbor = ii % max_neighs; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + + snaKK.compute_deidrj_cpu(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + This routine formally accumulates the "chunked" force contributions + into the broader LAMMPS "f" force View. As appropriate it + also accumulates the total energy and the virial. CPU and GPU. ------------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index fa187c26ea..4c9cace26a 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -237,9 +237,9 @@ class SNAKokkos { // functions for bispectrum coefficients, CPU only KOKKOS_INLINE_FUNCTION - void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&) const; // ForceSNAP + void pre_ui_cpu(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int) const; // ForceSNAP + void compute_ui_cpu(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_zi_cpu(const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION @@ -247,9 +247,9 @@ class SNAKokkos { // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION - void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int) const; //ForceSNAP + void compute_duidrj_cpu(const int&, const int&) const; //ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int) const; // ForceSNAP + void compute_deidrj_cpu(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION real_type compute_sfac(real_type, real_type, real_type, real_type) const; // add_uarraytot, compute_duarray @@ -358,11 +358,10 @@ class SNAKokkos { void init_rootpqarray(); // init() KOKKOS_INLINE_FUNCTION - void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui + void add_uarraytot(int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int) const; // compute_ui KOKKOS_INLINE_FUNCTION - void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - const real_type&, const real_type&, const real_type&, + void compute_uarray_cpu(int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&) const; // compute_ui_cpu @@ -372,8 +371,7 @@ class SNAKokkos { inline int compute_ncoeff(); // SNAKokkos() KOKKOS_INLINE_FUNCTION - void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu + void compute_duarray_cpu(int, int, const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&) const; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 7988026dd2..6cb83b3910 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -1156,17 +1156,14 @@ typename SNAKokkos::real_type SNAKokkos KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) const +void SNAKokkos::pre_ui_cpu(const int& iatom, const int& ielem) const { for (int jelem = 0; jelem < nelements; jelem++) { for (int j = 0; j <= twojmax; j++) { int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug // Only diagonal elements get initialized - // for (int m = 0; m < (j+1)*(j/2+1); m++) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)), - [&] (const int m) { - + for (int m = 0; m < (j+1)*(j/2+1); m++) { const int jjup = jju + m; // if m is on the "diagonal", initialize it with the self energy. @@ -1176,7 +1173,7 @@ void SNAKokkos::pre_ui_cpu(const typename ulisttot_re(iatom, jelem, jjup) = init; ulisttot_im(iatom, jelem, jjup) = 0; - }); + }; } } @@ -1191,7 +1188,7 @@ void SNAKokkos::pre_ui_cpu(const typename template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) const +void SNAKokkos::compute_ui_cpu(const int& iatom, const int& jnbor) const { real_type rsq, r, x, y, z, z0, theta0; @@ -1211,8 +1208,8 @@ void SNAKokkos::compute_ui_cpu(const typen // theta0 = (r - rmin0) * rscale0; z0 = r / tan(theta0); - compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); - add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor)); + compute_uarray_cpu(iatom, jnbor, x, y, z, z0, r); + add_uarraytot(iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor)); } /* ---------------------------------------------------------------------- @@ -1328,7 +1325,7 @@ void SNAKokkos::compute_yi_cpu(int iter) c template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) const +void SNAKokkos::compute_duidrj_cpu(const int& iatom, const int& jnbor) const { real_type rsq, r, x, y, z, z0, theta0, cs, sn; real_type dz0dr; @@ -1345,7 +1342,7 @@ void SNAKokkos::compute_duidrj_cpu(const t z0 = r * cs / sn; dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)); + compute_duarray_cpu(iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)); } @@ -1360,54 +1357,42 @@ void SNAKokkos::compute_duidrj_cpu(const t template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) const +void SNAKokkos::compute_deidrj_cpu(const int& iatom, const int& jnbor) const { - t_scalar3 final_sum; + real_type force_sum[3] = { 0, 0, 0 }; const int jelem = element(iatom, jnbor); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j, t_scalar3& sum_tmp) { + for (int j = 0; j <= twojmax; j++) { int jju_half = idxu_half_block[j]; int jju_cache = idxu_cache_block[j]; - for (int mb = 0; 2*mb < j; mb++) + for (int mb = 0; 2 * mb < j; mb++) { for (int ma = 0; ma <= j; ma++) { - const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; - sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im; - sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im; - sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im; + complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + for (int k = 0; k < 3; k++) + force_sum[k] += dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + + dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im; jju_half++; jju_cache++; - } //end loop over ma mb + } + } //end loop over ma mb // For j even, handle middle column - if (j%2 == 0) { - - int mb = j/2; - for (int ma = 0; ma < mb; ma++) { - const complex y_val = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; - sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im; - sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im; - sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im; + if (j % 2 == 0) { + //int mb = j / 2; + for (int ma = 0; ma <= j; ma++) { + complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + for (int k = 0; k < 3; k++) + force_sum[k] += static_cast(0.5) * (dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + + dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im); jju_half++; jju_cache++; } - - //int ma = mb; - // 0.5 is meant to avoid double-counting - const complex y_val = { 0.5 * ylist_re(iatom, jelem, jju_half), 0.5 * ylist_im(iatom, jelem, jju_half) }; - sum_tmp.x += dulist_cpu(iatom, jnbor, jju_cache, 0).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 0).im * y_val.im; - sum_tmp.y += dulist_cpu(iatom, jnbor, jju_cache, 1).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 1).im * y_val.im; - sum_tmp.z += dulist_cpu(iatom, jnbor, jju_cache, 2).re * y_val.re + dulist_cpu(iatom, jnbor, jju_cache, 2).im * y_val.im; } // end if jeven - },final_sum); // end loop over j - - Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,0) = final_sum.x*2.0; - dedr(iatom,jnbor,1) = final_sum.y*2.0; - dedr(iatom,jnbor,2) = final_sum.z*2.0; - }); + } + for (int k = 0; k < 3; k++) + dedr(iatom, jnbor, k) = 2 * force_sum[k]; } @@ -1421,15 +1406,13 @@ void SNAKokkos::compute_deidrj_cpu(const t template KOKKOS_INLINE_FUNCTION -void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, +void SNAKokkos::add_uarraytot(int iatom, int jnbor, const real_type& r, const real_type& wj, const real_type& rcut, const real_type& sinner, const real_type& dinner, int jelem) const { const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j) { - + for (int j = 0; j <= twojmax; j++) { int jju_half = idxu_half_block[j]; // index into ulisttot int jju_cache = idxu_cache_block[j]; // index into ulist @@ -1441,7 +1424,7 @@ void SNAKokkos::add_uarraytot(const typena count++; } } - }); + } } /* ---------------------------------------------------------------------- @@ -1452,23 +1435,15 @@ void SNAKokkos::add_uarraytot(const typena template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, +void SNAKokkos::compute_uarray_cpu(int iatom, int jnbor, const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) const { - real_type r0inv; - real_type a_r, b_r, a_i, b_i; - real_type rootpq; - // compute Cayley-Klein parameters for unit quaternion - - r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); - a_r = r0inv * z0; - a_i = -r0inv * z; - b_r = r0inv * y; - b_i = -r0inv * x; + real_type r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); + complex a = { r0inv * z0, -r0inv * z }; + complex b = { r0inv * y, -r0inv * x }; // VMK Section 4.8.2 - ulist_cpu(iatom, jnbor, 0) = complex::one(); for (int j = 1; j <= twojmax; j++) { @@ -1476,61 +1451,65 @@ void SNAKokkos::compute_uarray_cpu(const t int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug // fill in left side of matrix layer from previous layer + for (int mb = 0; 2*mb <= j; mb++) { + int jju_index = jju + (j + 1) * mb; + int jjup_index = jjup + j * mb; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), - [&] (const int& mb) { - //for (int mb = 0; 2*mb <= j; mb++) { - const int jju_index = jju+mb+mb*j; - ulist_cpu(iatom, jnbor, jju_index) = { static_cast(0), static_cast(0) }; + complex ui = complex::zero(); for (int ma = 0; ma < j; ma++) { - const int jju_index = jju+mb+mb*j+ma; - const int jjup_index = jjup+mb*j+ma; - rootpq = rootpqarray(j - ma,j - mb); - ulist_cpu(iatom, jnbor, jju_index).re += - rootpq * - (a_r * ulist_cpu(iatom, jnbor, jjup_index).re + - a_i * ulist_cpu(iatom, jnbor, jjup_index).im); - ulist_cpu(iatom, jnbor, jju_index).im += - rootpq * - (a_r * ulist_cpu(iatom, jnbor, jjup_index).im - - a_i * ulist_cpu(iatom, jnbor, jjup_index).re); + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); - rootpq = rootpqarray(ma + 1,j - mb); - ulist_cpu(iatom, jnbor, jju_index+1).re = - -rootpq * - (b_r * ulist_cpu(iatom, jnbor, jjup_index).re + - b_i * ulist_cpu(iatom, jnbor, jjup_index).im); - ulist_cpu(iatom, jnbor, jju_index+1).im = - -rootpq * - (b_r * ulist_cpu(iatom, jnbor, jjup_index).im - - b_i * ulist_cpu(iatom, jnbor, jjup_index).re); + real_type rootpq = rootpqarray(j - ma, j - mb); + ui.re += rootpq * (a.re * ui_prev.re + a.im * ui_prev.im); + ui.im += rootpq * (a.re * ui_prev.im - a.im * ui_prev.re); + + ulist_cpu(iatom, jnbor, jju_index) = ui; + + rootpq = rootpqarray(ma + 1, j - mb); + ui.re = -rootpq * (b.re * ui_prev.re + b.im * ui_prev.im); + ui.im = -rootpq * (b.re * ui_prev.im - b.im * ui_prev.re); + + jju_index++; + jjup_index++; } - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) + ulist_cpu(iatom, jnbor, jju_index) = ui; + } - // Only need to add one symmetrized row for convenience - // Symmetry gets "unfolded" in accumulating ulisttot - if (j%2==1 && mb==(j/2)) { - const int mbpar = (mb)%2==0?1:-1; - int mapar = mbpar; - for (int ma = 0; ma <= j; ma++) { - const int jju_index = jju + mb*(j+1) + ma; - const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1); - if (mapar == 1) { - ulist_cpu(iatom, jnbor, jjup_index).re = ulist_cpu(iatom, jnbor, jju_index).re; - ulist_cpu(iatom, jnbor, jjup_index).im = -ulist_cpu(iatom, jnbor, jju_index).im; - } else { - ulist_cpu(iatom, jnbor, jjup_index).re = -ulist_cpu(iatom, jnbor, jju_index).re; - ulist_cpu(iatom, jnbor, jjup_index).im = ulist_cpu(iatom, jnbor, jju_index).im; - } - mapar = -mapar; - } + // If j is odd (half-integer in the mathematical convention), we need + // to add one more row for convenience (for now). This can either be done + // via symmetry (see the commented code below), or by the equations to fill + // from the left instead of the right + if (j % 2 == 1) { + int mb = j / 2; + // begin filling in the extra row + int jju_index = jju + (mb + 1) * (j + 1); + int jjup_index = jjup + mb * j; + + complex ui = complex::zero(); + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + + real_type rootpq = rootpqarray(j - ma, mb + 1); + ui.re += rootpq * (b.re * ui_prev.re - b.im * ui_prev.im); + ui.im += rootpq * (b.re * ui_prev.im + b.im * ui_prev.re); + + ulist_cpu(iatom, jnbor, jju_index) = ui; + + rootpq = rootpqarray(ma + 1, mb + 1); + ui.re = rootpq * (a.re * ui_prev.re - a.im * ui_prev.im); + ui.im = rootpq * (a.re * ui_prev.im + a.im * ui_prev.re); + + jju_index++; + jjup_index++; } - }); + ulist_cpu(iatom, jnbor, jju_index) = ui; + } } + } /* ---------------------------------------------------------------------- @@ -1541,30 +1520,25 @@ void SNAKokkos::compute_uarray_cpu(const t template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, +void SNAKokkos::compute_duarray_cpu(int iatom, int jnbor, const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r, const real_type& dz0dr, const real_type& wj, const real_type& rcut, const real_type& sinner, const real_type& dinner) const { - real_type r0inv; - real_type a_r, a_i, b_r, b_i; - real_type u[3], da_r[3], da_i[3], db_r[3], db_i[3]; - real_type dz0[3], dr0inv[3], dr0invdr; - real_type rootpq; + complex da[3], db[3]; + real_type u[3], dz0[3], dr0inv[3]; real_type rinv = 1.0 / r; u[0] = x * rinv; u[1] = y * rinv; u[2] = z * rinv; - r0inv = 1.0 / sqrt(r * r + z0 * z0); - a_r = z0 * r0inv; - a_i = -z * r0inv; - b_r = y * r0inv; - b_i = -x * r0inv; + real_type r0inv = 1.0 / sqrt(r * r + z0 * z0); + complex a = { z0 * r0inv, -z * r0inv }; + complex b = { y * r0inv, -x * r0inv }; - dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); + real_type dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); dr0inv[0] = dr0invdr * u[0]; dr0inv[1] = dr0invdr * u[1]; @@ -1575,19 +1549,19 @@ void SNAKokkos::compute_duarray_cpu(const dz0[2] = dz0dr * u[2]; for (int k = 0; k < 3; k++) { - da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k]; - da_i[k] = -z * dr0inv[k]; + da[k].re = dz0[k] * r0inv + z0 * dr0inv[k]; + da[k].im = -z * dr0inv[k]; } - da_i[2] += -r0inv; + da[2].im += -r0inv; for (int k = 0; k < 3; k++) { - db_r[k] = y * dr0inv[k]; - db_i[k] = -x * dr0inv[k]; + db[k].re = y * dr0inv[k]; + db[k].im = -x * dr0inv[k]; } - db_i[0] += -r0inv; - db_r[1] += r0inv; + db[0].im += -r0inv; + db[1].re += r0inv; for (int k = 0; k < 3; k++) dulist_cpu(iatom, jnbor, 0, k) = complex::zero(); @@ -1595,72 +1569,94 @@ void SNAKokkos::compute_duarray_cpu(const for (int j = 1; j <= twojmax; j++) { int jju = idxu_cache_block[j]; int jjup = idxu_cache_block[j-1]; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), - [&] (const int& mb) { - //for (int mb = 0; 2*mb <= j; mb++) { - const int jju_index = jju+mb+mb*j; - for (int k = 0; k < 3; k++) - dulist_cpu(iatom, jnbor, jju_index, k) = complex::zero(); + + for (int mb = 0; 2*mb <= j; mb++) { + int jju_index = jju + mb * (j + 1); + int jjup_index = jjup + mb * j; + + complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() }; for (int ma = 0; ma < j; ma++) { - const int jju_index = jju+mb+mb*j+ma; - const int jjup_index = jjup+mb*j+ma; - rootpq = rootpqarray(j - ma,j - mb); + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0), + dulist_cpu(iatom, jnbor, jjup_index, 1), + dulist_cpu(iatom, jnbor, jjup_index, 2) }; + + real_type rootpq = rootpqarray(j - ma,j - mb); for (int k = 0; k < 3; k++) { - dulist_cpu(iatom, jnbor, jju_index, k).re += - rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re + - da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im + - a_r * dulist_cpu(iatom, jnbor, jjup_index, k).re + - a_i * dulist_cpu(iatom, jnbor, jjup_index, k).im); - dulist_cpu(iatom, jnbor, jju_index, k).im += - rootpq * (da_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im - - da_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re + - a_r * dulist_cpu(iatom, jnbor, jjup_index, k).im - - a_i * dulist_cpu(iatom, jnbor, jjup_index, k).re); + duidrj[k].re += rootpq * (da[k].re * ui_prev.re + da[k].im * ui_prev.im + + a.re * duidrj_prev[k].re + a.im * duidrj_prev[k].im); + + duidrj[k].im += rootpq * (da[k].re * ui_prev.im - da[k].im * ui_prev.re + + a.re * duidrj_prev[k].im - a.im * duidrj_prev[k].re); + + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; } rootpq = rootpqarray(ma + 1,j - mb); for (int k = 0; k < 3; k++) { - dulist_cpu(iatom, jnbor, jju_index+1, k).re = - -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).re + - db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).im + - b_r * dulist_cpu(iatom, jnbor, jjup_index, k).re + - b_i * dulist_cpu(iatom, jnbor, jjup_index, k).im); - dulist_cpu(iatom, jnbor, jju_index+1, k).im = - -rootpq * (db_r[k] * ulist_cpu(iatom, jnbor, jjup_index).im - - db_i[k] * ulist_cpu(iatom, jnbor, jjup_index).re + - b_r * dulist_cpu(iatom, jnbor, jjup_index, k).im - - b_i * dulist_cpu(iatom, jnbor, jjup_index, k).re); + duidrj[k].re = -rootpq * (db[k].re * ui_prev.re + db[k].im * ui_prev.im + + b.re * duidrj_prev[k].re + b.im * duidrj_prev[k].im); + + duidrj[k].im = -rootpq * (db[k].re * ui_prev.im - db[k].im * ui_prev.re + + b.re * duidrj_prev[k].im - b.im * duidrj_prev[k].re); } + + jju_index++; + jjup_index++; } - // Only need to add one symmetrized row for convenience - // Symmetry gets "unfolded" during the dedr accumulation - - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) - - if (j%2==1 && mb==(j/2)) { - const int mbpar = (mb)%2==0?1:-1; - int mapar = mbpar; - for (int ma = 0; ma <= j; ma++) { - const int jju_index = jju+mb*(j+1)+ma; - const int jjup_index = jju+(mb+2)*(j+1)-(ma+1); - if (mapar == 1) { - for (int k = 0; k < 3; k++) { - dulist_cpu(iatom, jnbor, jjup_index, k).re = dulist_cpu(iatom, jnbor, jju_index, k).re; - dulist_cpu(iatom, jnbor, jjup_index, k).im = -dulist_cpu(iatom, jnbor, jju_index, k).im; - } - } else { - for (int k = 0; k < 3; k++) { - dulist_cpu(iatom, jnbor, jjup_index, k).re = -dulist_cpu(iatom, jnbor, jju_index, k).re; - dulist_cpu(iatom, jnbor, jjup_index, k).im = dulist_cpu(iatom, jnbor, jju_index, k).im; - } - } - mapar = -mapar; - } + for (int k = 0; k < 3; k++) { + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; } - }); + } + + // Only need to add one symmetrized row for convenience + // Symmetry gets "unfolded" during the dedr accumulation + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) + + if (j % 2 == 1) { + int mb = j / 2; + // begin filling in the extra row + int jju_index = jju + (mb + 1) * (j + 1); + int jjup_index = jjup + mb * j; + + complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() }; + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0), + dulist_cpu(iatom, jnbor, jjup_index, 1), + dulist_cpu(iatom, jnbor, jjup_index, 2) }; + + real_type rootpq = rootpqarray(j - ma, mb + 1); + for (int k = 0; k < 3; k++) { + duidrj[k].re += rootpq * (db[k].re * ui_prev.re - db[k].im * ui_prev.im + + b.re * duidrj_prev[k].re - b.im * duidrj_prev[k].im); + duidrj[k].im += rootpq * (db[k].re * ui_prev.im + db[k].im * ui_prev.re + + b.re * duidrj_prev[k].im + b.im * duidrj_prev[k].re); + + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; + } + + rootpq = rootpqarray(ma + 1, mb + 1); + for (int k = 0; k < 3; k++) { + duidrj[k].re = rootpq * (da[k].re * ui_prev.re - da[k].im * ui_prev.im + + a.re * duidrj_prev[k].re - a.im * duidrj_prev[k].im); + duidrj[k].im = rootpq * (da[k].re * ui_prev.im + da[k].im * ui_prev.re + + a.re * duidrj_prev[k].im + a.im * duidrj_prev[k].re); + } + + jju_index++; + jjup_index++; + } + + for (int k = 0; k < 3; k++) { + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; + } + } } real_type sfac = compute_sfac(r, rcut, sinner, dinner); @@ -1678,9 +1674,9 @@ void SNAKokkos::compute_duarray_cpu(const for (int ma = 0; ma <= j; ma++) { for (int k = 0; k < 3; k++) { dulist_cpu(iatom, jnbor, jju, k).re = dsfac * ulist_cpu(iatom, jnbor, jju).re * u[k] + - sfac * dulist_cpu(iatom, jnbor, jju, k).re; + sfac * dulist_cpu(iatom, jnbor, jju, k).re; dulist_cpu(iatom, jnbor, jju, k).im = dsfac * ulist_cpu(iatom, jnbor, jju).im * u[k] + - sfac * dulist_cpu(iatom, jnbor, jju, k).im; + sfac * dulist_cpu(iatom, jnbor, jju, k).im; } jju++; }