diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 9f6ef83016..97ff5a00f0 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -46,8 +46,8 @@ struct TagPairSNAPPreUi{}; struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPZeroBeta{}; -struct TagPairSNAPComputeBeta{}; +struct TagPairSNAPComputeBetaLinear{}; +struct TagPairSNAPComputeBetaQuadratic{}; struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYiWithZlist{}; template @@ -94,7 +94,6 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 2; static constexpr int tile_size_compute_zi = 2; static constexpr int tile_size_compute_bi = 2; - static constexpr int tile_size_zero_beta = 2; static constexpr int tile_size_compute_beta = 2; static constexpr int tile_size_compute_yi = 2; static constexpr int team_size_compute_fused_deidrj = 2; @@ -106,7 +105,6 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 8; static constexpr int tile_size_compute_zi = 4; static constexpr int tile_size_compute_bi = 4; - static constexpr int tile_size_zero_beta = 8; static constexpr int tile_size_compute_beta = 8; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = 4; @@ -118,7 +116,6 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 4; static constexpr int tile_size_compute_zi = 8; static constexpr int tile_size_compute_bi = 4; - static constexpr int tile_size_zero_beta = 4; static constexpr int tile_size_compute_beta = 4; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2; @@ -236,22 +233,22 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeBi, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroBeta, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + void operator() (TagPairSNAPComputeBetaLinear, const int& iatom_mod, const int& idxb, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroBeta, const int& iatom, const int& idxb) const; + void operator() (TagPairSNAPComputeBetaLinear, const int& iatom, const int& idxb) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroBeta, const int& iatom) const; + void operator() (TagPairSNAPComputeBetaLinear, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBeta, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom_mod, const int& idxb, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBeta, const int& iatom, const int& idxb) const; + void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom, const int& idxb) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBeta, const int& iatom) const; + void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 056dfbf231..3ba61dab91 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -292,7 +292,7 @@ void PairSNAPKokkos::compute(int eflag_in, } { - // Expand ulisttot_re,_im -> ulisttot + // Expand ulisttot -> ulisttot // Zero out ylist auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); Kokkos::parallel_for("TransformUi", policy_transform_ui, *this); @@ -312,13 +312,15 @@ void PairSNAPKokkos::compute(int eflag_in, } { - // Zero beta out - auto policy_zero_beta = snap_get_policy(chunk_size_div, snaKK.idxb_max); - Kokkos::parallel_for("ZeroBeta",policy_zero_beta,*this); + //Compute beta = dE_i/dB_i for all i in list; linear portion only + auto policy_compute_beta_linear = snap_get_policy(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBetaLinear", policy_compute_beta_linear, *this); - //Compute beta = dE_i/dB_i for all i in list - auto policy_compute_beta = snap_get_policy(chunk_size_div, snaKK.idxb_max); - Kokkos::parallel_for("ComputeBeta", policy_compute_beta, *this); + if (quadraticflag) { + // Compute the quadratic correction + auto policy_compute_beta_quadratic = snap_get_policy(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBetaQuadratic", policy_compute_beta_quadratic, *this); + } //Note zeroing `ylist` is fused into `TransformUi`. if (quadraticflag || eflag) { @@ -450,8 +452,8 @@ void PairSNAPKokkos::compute(int eflag_in, // free duplicated memory if (need_dup) { - dup_f = {}; - dup_vatom = {}; + dup_f = decltype(dup_f)(); + dup_vatom = decltype(dup_vatom)(); } } @@ -934,48 +936,15 @@ void PairSNAPKokkos::operator() (TagPairSN snaKK.compute_bi(iatom, jjb); } -/* ---------------------------------------------------------------------- - Zero out beta in advance of accumulating. CPU and GPU. -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& iatom_mod, const int& jjb, const int& iatom_div) const { - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - if (jjb >= snaKK.idxb_max) return; - for (int itriple = 0; itriple < snaKK.ntriples; itriple++) - snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& iatom, const int& jjb) const { - if (iatom >= chunk_size) return; - for (int itriple = 0; itriple < snaKK.ntriples; itriple++) - snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroBeta, const int& iatom) const { - if (iatom >= chunk_size) return; - for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) - for (int itriple = 0; itriple < snaKK.ntriples; itriple++) - snaKK.d_beta(iatom, jjb + itriple * snaKK.idxb_max) = 0; -} - /* ---------------------------------------------------------------------- Assemble the "beta" coefficients that enter the computation of the - adjoint matrices Y. For quadratic SNAP, this includes accumulating - energy triple products into an "effective" beta that encodes the - quadratic terms with otherwise linear compute work. - CPU and GPU. + adjoint matrices Y. This is just for a linear potential. A quadratic + contribution is added in a subsequent kernel. CPU and GPU. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& iatom_mod, const int& idxb, const int& iatom_div) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaLinear, const int& iatom_mod, const int& idxb, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; if (idxb >= snaKK.idxb_max) return; @@ -984,24 +953,24 @@ void PairSNAPKokkos::operator() (TagPairSN const int itype = type[i]; const int ielem = d_map[itype]; - snaKK.template compute_beta(iatom, idxb, ielem); + snaKK.compute_beta_linear(iatom, idxb, ielem); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& iatom, const int& idxb) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaLinear, const int& iatom, const int& idxb) const { if (iatom >= chunk_size) return; const int i = d_ilist[iatom + chunk_offset]; const int itype = type[i]; const int ielem = d_map[itype]; - snaKK.template compute_beta(iatom, idxb, ielem); + snaKK.compute_beta_linear(iatom, idxb, ielem); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBeta, const int& iatom) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaLinear, const int& iatom) const { if (iatom >= chunk_size) return; const int i = d_ilist[iatom + chunk_offset]; @@ -1009,7 +978,53 @@ void PairSNAPKokkos::operator() (TagPairSN const int ielem = d_map[itype]; for (int idxb = 0; idxb < snaKK.idxb_max; idxb++) - snaKK.template compute_beta(iatom, idxb, ielem); + snaKK.compute_beta_linear(iatom, idxb, ielem); +} + +/* ---------------------------------------------------------------------- + Accumulate the qudratic terms which includes accumulating + energy triple products into an "effective" beta that encodes the + quadratic terms with otherwise linear compute work. + CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom_mod, const int& idxb, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (idxb >= snaKK.idxb_max) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + snaKK.template compute_beta_quadratic(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom, const int& idxb) const { + if (iatom >= chunk_size) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + snaKK.template compute_beta_quadratic(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + for (int idxb = 0; idxb < snaKK.idxb_max; idxb++) + snaKK.template compute_beta_quadratic(iatom, idxb, ielem); } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 438f1b4304..4fc34c260d 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -212,8 +212,10 @@ class SNAKokkos { void compute_yi_with_zlist(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi(const int&, const int&) const; // ForceSNAP + KOKKOS_INLINE_FUNCTION + void compute_beta_linear(const int&, const int&, const int&) const; template KOKKOS_INLINE_FUNCTION - void compute_beta(const int&, const int&, const int&) const; + void compute_beta_quadratic(const int&, const int&, const int&) const; // functions for derivatives, GPU only // version of the code with parallelism over j_bend diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 7d86d5ceaa..e14add1c8f 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -963,20 +963,29 @@ void SNAKokkos::compute_bi(const int& iato or accumulating the quadratic terms from blist ------------------------------------------------------------------------- */ +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_beta_linear(const int& iatom, const int& idxb, const int& ielem) const +{ + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); + + for (int itriple = 0; itriple < ntriples; itriple++) { + int icoeff = idxb + itriple * idxb_max; + d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; + } +} + template template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_beta(const int& iatom, const int& idxb, const int& ielem) const +void SNAKokkos::compute_beta_quadratic(const int& iatom, const int& idxb, const int& ielem) const { auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); // handle quadratic && chemflag as a special case - if (quadratic_flag && chem_flag) { + if (chem_flag) { if (idxb == 0) { - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; - } - + // no need to use atomics, we're just serializing int k = ncoeff+1; for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; @@ -995,38 +1004,36 @@ void SNAKokkos::compute_beta(const int& ia } } } else { - for (int itriple = 0; itriple < ntriples; itriple++) { - int icoeff = idxb + itriple * idxb_max; - if constexpr (need_atomics) - Kokkos::atomic_add(&d_beta(iatom, icoeff), d_coeffi[icoeff+1]); - else - d_beta(iatom, icoeff) += d_coeffi[icoeff+1]; - } + // Compute triangular partial sum via a closed form to get the starting offset + int k = (idxb * (1 + 2 * idxb_max - idxb)) / 2 + idxb_max + 1; + real_type bveci = blist(iatom, 0, idxb); - if (quadratic_flag) { - int k = (idxb * (1 + 2 * idxb_max - idxb)) / 2 + idxb_max + 1; - real_type bveci = blist(iatom, 0, idxb); + // Locally accumulate the contribution to d_beta(iatom, idxb) + real_type beta_idxb_accum = d_coeffi[k] * bveci; + k++; + + for (int jdxb = idxb + 1; jdxb < idxb_max; jdxb++) { + real_type bvecj = blist(iatom, 0, jdxb); + real_type coeff_k = d_coeffi[k]; + beta_idxb_accum += coeff_k * bvecj; + + // Accumulate "half" contribution into d_beta(iatom, jdxb) if constexpr (need_atomics) - Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bveci); + Kokkos::atomic_add(&d_beta(iatom, jdxb), coeff_k * bveci); else - d_beta(iatom, idxb) += d_coeffi[k] * bveci; + d_beta(iatom, jdxb) += coeff_k * bveci; + k++; - - for (int jdxb = idxb + 1; jdxb < idxb_max; jdxb++) { - real_type bvecj = blist(iatom, 0, jdxb); - if constexpr (need_atomics) { - Kokkos::atomic_add(&d_beta(iatom, idxb), d_coeffi[k] * bvecj); - Kokkos::atomic_add(&d_beta(iatom, jdxb), d_coeffi[k] * bveci); - } else { - d_beta(iatom, idxb) += d_coeffi[k] * bvecj; - d_beta(iatom, jdxb) += d_coeffi[k] * bveci; - } - k++; - } } + + if constexpr (need_atomics) + Kokkos::atomic_add(&d_beta(iatom, idxb), beta_idxb_accum); + else + d_beta(iatom, idxb) += beta_idxb_accum; } } + /* ---------------------------------------------------------------------- Compute Yi from Ui without storing Zi, looping over zlist indices. ------------------------------------------------------------------------- */