From 782ca9e0ff366c8a2e540bfc08d558155357fac5 Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Fri, 1 Dec 2023 14:46:30 -0700 Subject: [PATCH 1/8] Thread over neighbors in addition to atoms when using a half neighbor list --- src/KOKKOS/kokkos.cpp | 4 +- src/KOKKOS/pair_kokkos.h | 318 ++++++++++++++++++++++++++++----------- 2 files changed, 236 insertions(+), 86 deletions(-) diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index c963cd52d0..5572f69901 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -608,8 +608,8 @@ void KokkosLMP::accelerator(int narg, char **arg) force->newton = force->newton_pair = force->newton_bond = newtonflag; - if (neigh_thread && neighflag != FULL) - error->all(FLERR,"Must use KOKKOS package option 'neigh full' with 'neigh/thread on'"); + if (neigh_thread && newtonflag) + error->all(FLERR,"Must use KOKKOS package option 'newton off' with 'neigh/thread on'"); neighbor->binsize_user = binsize; if (binsize <= 0.0) neighbor->binsizeflag = 0; diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index d3c766f5ae..c0b81cc594 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -116,6 +116,7 @@ struct PairComputeFunctor { // Loop over neighbors of one atom without coulomb interaction // This function is called in parallel + template KOKKOS_FUNCTION EV_FLOAT compute_item(const int& ii, @@ -161,7 +162,7 @@ struct PairComputeFunctor { fytmp += dely*fpair; fztmp += delz*fpair; - if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) { + if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) { a_f(j,0) -= delx*fpair; a_f(j,1) -= dely*fpair; a_f(j,2) -= delz*fpair; @@ -169,9 +170,9 @@ struct PairComputeFunctor { if (EVFLAG) { F_FLOAT evdwl = 0.0; - if (c.eflag) { + if (c.eflag_either) { evdwl = factor_lj * c.template compute_evdwl(rsq,i,j,itype,jtype); - ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j KOKKOS_FUNCTION EV_FLOAT compute_item(const int& ii, @@ -241,7 +243,7 @@ struct PairComputeFunctor { fytmp += dely*fpair; fztmp += delz*fpair; - if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) { + if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) { a_f(j,0) -= delx*fpair; a_f(j,1) -= dely*fpair; a_f(j,2) -= delz*fpair; @@ -250,14 +252,14 @@ struct PairComputeFunctor { if (EVFLAG) { F_FLOAT evdwl = 0.0; F_FLOAT ecoul = 0.0; - if (c.eflag) { + if (c.eflag_either) { if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) { evdwl = factor_lj * c.template compute_evdwl(rsq,i,j,itype,jtype); - ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j(rsq,i,j,itype,jtype,factor_coul,qtmp); - ev.ecoul += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j::member_type team, const NeighListKokkos &list, const NoCoulTag&) const { + auto a_f = dup_f.template access::value>(); + const int inum = team.league_size(); const int atoms_per_team = team.team_size(); const int firstatom = team.league_rank()*atoms_per_team; @@ -292,7 +297,7 @@ struct PairComputeFunctor { const X_FLOAT ztmp = c.x(i,2); const int itype = c.type(i); - if (ZEROFLAG) { + if (NEIGHFLAG == FULL && ZEROFLAG) { Kokkos::single(Kokkos::PerThread(team), [&] (){ f(i,0) = 0.0; f(i,1) = 0.0; @@ -321,35 +326,49 @@ struct PairComputeFunctor { const F_FLOAT fpair = factor_lj*c.template compute_fpair(rsq,i,j,itype,jtype); - ftmp.x += delx*fpair; - ftmp.y += dely*fpair; - ftmp.z += delz*fpair; + const F_FLOAT fx = delx*fpair; + const F_FLOAT fy = dely*fpair; + const F_FLOAT fz = delz*fpair; + + ftmp.x += fx; + ftmp.y += fy; + ftmp.z += fz; + + if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) { + a_f(j,0) -= fx; + a_f(j,1) -= fy; + a_f(j,2) -= fz; + } } },fsum); Kokkos::single(Kokkos::PerThread(team), [&] () { - f(i,0) += fsum.x; - f(i,1) += fsum.y; - f(i,2) += fsum.z; + a_f(i,0) += fsum.x; + a_f(i,1) += fsum.y; + a_f(i,2) += fsum.z; }); }); } - // Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial + // TeamPolicy, newton off, and no energy/virial // Loop over neighbors of one atom with coulomb interaction // This function is called in parallel + KOKKOS_FUNCTION void compute_item_team(typename Kokkos::TeamPolicy::member_type team, const NeighListKokkos &list, const CoulTag& ) const { + auto a_f = dup_f.template access::value>(); + const int inum = team.league_size(); const int atoms_per_team = team.team_size(); int firstatom = team.league_rank()*atoms_per_team; int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) { + const int i = list.d_ilist[ii]; const X_FLOAT xtmp = c.x(i,0); const X_FLOAT ytmp = c.x(i,1); @@ -357,8 +376,9 @@ struct PairComputeFunctor { const int itype = c.type(i); const F_FLOAT qtmp = c.q(i); - if (ZEROFLAG) { - Kokkos::single(Kokkos::PerThread(team), [&] (){ + if (NEIGHFLAG == FULL && ZEROFLAG) { + Kokkos::single(Kokkos::PerThread(team), [&] () + { f(i,0) = 0.0; f(i,1) = 0.0; f(i,2) = 0.0; @@ -391,34 +411,50 @@ struct PairComputeFunctor { if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) fpair+=c.template compute_fcoul(rsq,i,j,itype,jtype,factor_coul,qtmp); - ftmp.x += delx*fpair; - ftmp.y += dely*fpair; - ftmp.z += delz*fpair; + const F_FLOAT fx = delx*fpair; + const F_FLOAT fy = dely*fpair; + const F_FLOAT fz = delz*fpair; + + ftmp.x += fx; + ftmp.y += fy; + ftmp.z += fz; + + if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) { + a_f(j,0) -= fx; + a_f(j,1) -= fy; + a_f(j,2) -= fz; + } } + },fsum); Kokkos::single(Kokkos::PerThread(team), [&] () { - f(i,0) += fsum.x; - f(i,1) += fsum.y; - f(i,2) += fsum.z; + a_f(i,0) += fsum.x; + a_f(i,1) += fsum.y; + a_f(i,2) += fsum.z; }); }); } - - // Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial + // TeamPolicy, newton off, and energy/virial // Loop over neighbors of one atom without coulomb interaction // This function is called in parallel + KOKKOS_FUNCTION EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy::member_type team, const NeighListKokkos &list, const NoCoulTag&) const { + auto a_f = dup_f.template access::value>(); + auto a_eatom = dup_eatom.template access::value>(); + auto a_vatom = dup_vatom.template access::value>(); + EV_FLOAT ev; const int inum = team.league_size(); const int atoms_per_team = team.team_size(); const int firstatom = team.league_rank()*atoms_per_team; const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) { const int i = list.d_ilist[ii]; @@ -427,8 +463,9 @@ struct PairComputeFunctor { const X_FLOAT ztmp = c.x(i,2); const int itype = c.type(i); - if (ZEROFLAG) { - Kokkos::single(Kokkos::PerThread(team), [&] (){ + if (NEIGHFLAG == FULL && ZEROFLAG) { + Kokkos::single(Kokkos::PerThread(team), [&] () + { f(i,0) = 0.0; f(i,1) = 0.0; f(i,2) = 0.0; @@ -456,37 +493,85 @@ struct PairComputeFunctor { const F_FLOAT fpair = factor_lj*c.template compute_fpair(rsq,i,j,itype,jtype); - fev_tmp.f[0] += delx*fpair; - fev_tmp.f[1] += dely*fpair; - fev_tmp.f[2] += delz*fpair; + const F_FLOAT fx = delx*fpair; + const F_FLOAT fy = dely*fpair; + const F_FLOAT fz = delz*fpair; + + fev_tmp.f[0] += fx; + fev_tmp.f[1] += fy; + fev_tmp.f[2] += fz; + + const int I_CONTRIB = (NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD); + const int J_CONTRIB = ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal); + const E_FLOAT factor = J_CONTRIB?1.0:0.5; + + if (J_CONTRIB) { + a_f(j,0) -= fx; + a_f(j,1) -= fy; + a_f(j,2) -= fz; + } F_FLOAT evdwl = 0.0; - if (c.eflag) { + if (c.eflag_either) { evdwl = factor_lj * c.template compute_evdwl(rsq,i,j,itype,jtype); - fev_tmp.evdwl += 0.5*evdwl; + fev_tmp.evdwl += factor * evdwl; + + if (c.eflag_atom) { + const E_FLOAT epairhalf = 0.5 * evdwl; + + if (I_CONTRIB) + a_eatom[i] += epairhalf; + + if (J_CONTRIB) + a_eatom[j] += epairhalf; + } } + if (c.vflag_either) { - fev_tmp.v[0] += 0.5*delx*delx*fpair; - fev_tmp.v[1] += 0.5*dely*dely*fpair; - fev_tmp.v[2] += 0.5*delz*delz*fpair; - fev_tmp.v[3] += 0.5*delx*dely*fpair; - fev_tmp.v[4] += 0.5*delx*delz*fpair; - fev_tmp.v[5] += 0.5*dely*delz*fpair; + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + fev_tmp.v[0] += factor*v0; + fev_tmp.v[1] += factor*v1; + fev_tmp.v[2] += factor*v2; + fev_tmp.v[3] += factor*v3; + fev_tmp.v[4] += factor*v4; + fev_tmp.v[5] += factor*v5; + + if (c.vflag_atom) { + if (I_CONTRIB) { + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; + } + if (J_CONTRIB) { + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; + } + } } } },fev); Kokkos::single(Kokkos::PerThread(team), [&] () { - f(i,0) += fev.f[0]; - f(i,1) += fev.f[1]; - f(i,2) += fev.f[2]; + a_f(i,0) += fev.f[0]; + a_f(i,1) += fev.f[1]; + a_f(i,2) += fev.f[2]; if (c.eflag_global) ev.evdwl += fev.evdwl; - if (c.eflag_atom) - d_eatom(i) += fev.evdwl; - if (c.vflag_global) { ev.v[0] += fev.v[0]; ev.v[1] += fev.v[1]; @@ -496,26 +581,37 @@ struct PairComputeFunctor { ev.v[5] += fev.v[5]; } - if (c.vflag_atom) { - d_vatom(i,0) += fev.v[0]; - d_vatom(i,1) += fev.v[1]; - d_vatom(i,2) += fev.v[2]; - d_vatom(i,3) += fev.v[3]; - d_vatom(i,4) += fev.v[4]; - d_vatom(i,5) += fev.v[5]; + if (NEIGHFLAG == FULL) { + + if (c.eflag_atom) + a_eatom(i) += fev.evdwl; + + if (c.vflag_atom) { + a_vatom(i,0) += fev.v[0]; + a_vatom(i,1) += fev.v[1]; + a_vatom(i,2) += fev.v[2]; + a_vatom(i,3) += fev.v[3]; + a_vatom(i,4) += fev.v[4]; + a_vatom(i,5) += fev.v[5]; + } } }); }); return ev; } - // Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial + // TeamPolicy, newton off, and energy/virial // Loop over neighbors of one atom with coulomb interaction // This function is called in parallel + KOKKOS_FUNCTION EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy::member_type team, const NeighListKokkos &list, const CoulTag& ) const { + auto a_f = dup_f.template access::value>(); + auto a_eatom = dup_eatom.template access::value>(); + auto a_vatom = dup_vatom.template access::value>(); + EV_FLOAT ev; const int inum = team.league_size(); @@ -566,45 +662,92 @@ struct PairComputeFunctor { if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) fpair+=c.template compute_fcoul(rsq,i,j,itype,jtype,factor_coul,qtmp); - fev_tmp.f[0] += delx*fpair; - fev_tmp.f[1] += dely*fpair; - fev_tmp.f[2] += delz*fpair; + const F_FLOAT fx = delx*fpair; + const F_FLOAT fy = dely*fpair; + const F_FLOAT fz = delz*fpair; + + fev_tmp.f[0] += fx; + fev_tmp.f[1] += fy; + fev_tmp.f[2] += fz; + + const int I_CONTRIB = (NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD); + const int J_CONTRIB = ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal); + const E_FLOAT factor = J_CONTRIB?1.0:0.5; + + if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) { + a_f(j,0) -= fx; + a_f(j,1) -= fy; + a_f(j,2) -= fz; + } F_FLOAT evdwl = 0.0; F_FLOAT ecoul = 0.0; - if (c.eflag) { + if (c.eflag_either) { if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) { evdwl = factor_lj * c.template compute_evdwl(rsq,i,j,itype,jtype); - fev_tmp.evdwl += 0.5*evdwl; + fev_tmp.evdwl += factor * evdwl; } if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) { ecoul = c.template compute_ecoul(rsq,i,j,itype,jtype,factor_coul,qtmp); - fev_tmp.ecoul += 0.5*ecoul; + fev_tmp.ecoul += factor * ecoul; + } + + + if (c.eflag_atom) { + const E_FLOAT epairhalf = 0.5 * (evdwl + ecoul); + + if (I_CONTRIB) + a_eatom[i] += epairhalf; + + if (J_CONTRIB) + a_eatom[j] += epairhalf; } } + if (c.vflag_either) { - fev_tmp.v[0] += 0.5*delx*delx*fpair; - fev_tmp.v[1] += 0.5*dely*dely*fpair; - fev_tmp.v[2] += 0.5*delz*delz*fpair; - fev_tmp.v[3] += 0.5*delx*dely*fpair; - fev_tmp.v[4] += 0.5*delx*delz*fpair; - fev_tmp.v[5] += 0.5*dely*delz*fpair; + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + fev_tmp.v[0] += factor*v0; + fev_tmp.v[1] += factor*v1; + fev_tmp.v[2] += factor*v2; + fev_tmp.v[3] += factor*v3; + fev_tmp.v[4] += factor*v4; + fev_tmp.v[5] += factor*v5; + + if (c.vflag_atom) { + if (I_CONTRIB) { + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; + } + if (J_CONTRIB) { + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; + } + } } } },fev); Kokkos::single(Kokkos::PerThread(team), [&] () { - f(i,0) += fev.f[0]; - f(i,1) += fev.f[1]; - f(i,2) += fev.f[2]; + a_f(i,0) += fev.f[0]; + a_f(i,1) += fev.f[1]; + a_f(i,2) += fev.f[2]; - if (c.eflag_global) { + if (c.eflag_global) ev.evdwl += fev.evdwl; - ev.ecoul += fev.ecoul; - } - - if (c.eflag_atom) - d_eatom(i) += fev.evdwl + fev.ecoul; if (c.vflag_global) { ev.v[0] += fev.v[0]; @@ -615,13 +758,19 @@ struct PairComputeFunctor { ev.v[5] += fev.v[5]; } - if (c.vflag_atom) { - d_vatom(i,0) += fev.v[0]; - d_vatom(i,1) += fev.v[1]; - d_vatom(i,2) += fev.v[2]; - d_vatom(i,3) += fev.v[3]; - d_vatom(i,4) += fev.v[4]; - d_vatom(i,5) += fev.v[5]; + if (NEIGHFLAG == FULL) { + + if (c.eflag_atom) + a_eatom(i) += fev.evdwl; + + if (c.vflag_atom) { + a_vatom(i,0) += fev.v[0]; + a_vatom(i,1) += fev.v[1]; + a_vatom(i,2) += fev.v[2]; + a_vatom(i,3) += fev.v[3]; + a_vatom(i,4) += fev.v[4]; + a_vatom(i,5) += fev.v[5]; + } } }); }); @@ -636,7 +785,7 @@ struct PairComputeFunctor { auto a_eatom = dup_eatom.template access::value>(); auto a_vatom = dup_vatom.template access::value>(); - const int EFLAG = c.eflag; + const int EFLAG = c.eflag_either; const int NEWTON_PAIR = c.newton_pair; const int VFLAG = c.vflag_either; @@ -657,7 +806,7 @@ struct PairComputeFunctor { const E_FLOAT v5 = dely*delz*fpair; if (c.vflag_global) { - if (NEIGHFLAG!=FULL) { + if (NEIGHFLAG != FULL) { if (NEWTON_PAIR) { ev.v[0] += v0; ev.v[1] += v1; @@ -747,7 +896,8 @@ struct PairComputeFunctor { // This uses the fact that failure to match template parameters is not an error. // By having the enable_if with a ! and without it, exactly one of the functions // pair_compute_neighlist will match - either the dummy version -// or the real one further below. +// or the real one further below + template EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t*> list) { EV_FLOAT ev; From c0c64e812fabd092400fc08039cc56b7147aa936 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 18 Dec 2023 11:29:42 -0700 Subject: [PATCH 2/8] Enable default threading over neighbors for half list --- src/KOKKOS/pair_kokkos.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index c0b81cc594..a160f84d95 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -933,8 +933,9 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P EV_FLOAT ev; if (!fpair->lmp->kokkos->neigh_thread_set) - if (list->inum <= 16384 && NEIGHFLAG == FULL) - fpair->lmp->kokkos->neigh_thread = 1; + if (list->inum <= 16384) + if (NEIGHFLAG == FULL || !fpair->newton_pair) + fpair->lmp->kokkos->neigh_thread = 1; if (fpair->lmp->kokkos->neigh_thread) { From 8e4871c5e1d5527a0e10ac2a30e5048a14db5e55 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 18 Dec 2023 16:37:31 -0700 Subject: [PATCH 3/8] Tune team params --- src/KOKKOS/pair_kokkos.h | 61 +++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index a160f84d95..bd543bbbba 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -19,10 +19,12 @@ #ifndef LMP_PAIR_KOKKOS_H #define LMP_PAIR_KOKKOS_H -#include "Kokkos_Macros.hpp" #include "pair.h" // IWYU pragma: export #include "neighbor_kokkos.h" #include "neigh_list_kokkos.h" +#include "math_special.h" +#include "update.h" +#include "Kokkos_Macros.hpp" #include "Kokkos_ScatterView.hpp" namespace LAMMPS_NS { @@ -907,24 +909,21 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t -int GetTeamSize(FunctorStyle& KOKKOS_GPU_ARG(functor), int KOKKOS_GPU_ARG(inum), - int KOKKOS_GPU_ARG(reduce_flag), int team_size, int KOKKOS_GPU_ARG(vector_length)) { +template +int GetMaxNeighs(NeighStyle* list) +{ + auto d_ilist = list->d_ilist; + auto d_numneigh = list->d_numneigh; + int inum = list->inum; -#ifdef LMP_KOKKOS_GPU - int team_size_max; + int maxneigh = 0; + Kokkos::parallel_reduce(inum, LAMMPS_LAMBDA(const int ii, int &maxneigh) { + const int i = d_ilist[ii]; + const int num_neighs = d_numneigh[i]; + maxneigh = MAX(maxneigh,num_neighs); + }, Kokkos::Max(maxneigh)); - if (reduce_flag) - team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelReduceTag()); - else - team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelForTag()); - - if (team_size*vector_length > team_size_max) - team_size = team_size_max/vector_length; -#else - team_size = 1; -#endif - return team_size; + return maxneigh; } // Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL @@ -939,19 +938,35 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P if (fpair->lmp->kokkos->neigh_thread) { - int vector_length = 8; - int atoms_per_team = 32; + static int vectorsize = 0; + static int lastcall = -1; + +#if defined(LMP_KOKKOS_GPU) + + #if defined(KOKKOS_ENABLE_HIP) + int max_vectorsize = 64; + #else + int max_vectorsize = 32; + #endif + + if (!vectorsize || lastcall < fpair->lmp->neighbor->lastcall) { + lastcall = fpair->lmp->update->ntimestep; + vectorsize = GetMaxNeighs(list); + vectorsize = MathSpecial::powint(2,int(log2(vectorsize))); // round down to nearest power of 2 + vectorsize = MIN(vectorsize,max_vectorsize); + } +#else + vectorsize = 1; +#endif if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { PairComputeFunctor ff(fpair,list); - atoms_per_team = GetTeamSize(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length); - Kokkos::TeamPolicy > policy(list->inum,atoms_per_team,vector_length); + Kokkos::TeamPolicy > policy(list->inum,Kokkos::AUTO(),vectorsize); if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev); else Kokkos::parallel_for(policy,ff); } else { PairComputeFunctor ff(fpair,list); - atoms_per_team = GetTeamSize(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length); - Kokkos::TeamPolicy > policy(list->inum,atoms_per_team,vector_length); + Kokkos::TeamPolicy > policy(list->inum,Kokkos::AUTO(),vectorsize); if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev); else Kokkos::parallel_for(policy,ff); } From ab29200c60ad807ed3c7415738eb8c62737a3d50 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 19 Dec 2023 10:26:52 -0700 Subject: [PATCH 4/8] More tuning --- src/KOKKOS/pair_kokkos.h | 41 +++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index bd543bbbba..55fe01f6d2 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -65,6 +65,7 @@ struct PairComputeFunctor { typename AT::t_f_array f; typename AT::t_efloat_1d d_eatom; typename AT::t_virial_array d_vatom; + int inum; using KKDeviceType = typename KKDevice::value; using DUP = NeedDup_v; @@ -97,6 +98,7 @@ struct PairComputeFunctor { dup_f = Kokkos::Experimental::create_scatter_view(c.f); dup_eatom = Kokkos::Experimental::create_scatter_view(c.d_eatom); dup_vatom = Kokkos::Experimental::create_scatter_view(c.d_vatom); + inum = list.inum; }; // Set copymode = 1 so parent allocations aren't destructed by copies of the style @@ -287,7 +289,6 @@ struct PairComputeFunctor { auto a_f = dup_f.template access::value>(); - const int inum = team.league_size(); const int atoms_per_team = team.team_size(); const int firstatom = team.league_rank()*atoms_per_team; const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; @@ -364,13 +365,11 @@ struct PairComputeFunctor { auto a_f = dup_f.template access::value>(); - const int inum = team.league_size(); const int atoms_per_team = team.team_size(); int firstatom = team.league_rank()*atoms_per_team; int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) { - const int i = list.d_ilist[ii]; const X_FLOAT xtmp = c.x(i,0); const X_FLOAT ytmp = c.x(i,1); @@ -452,11 +451,9 @@ struct PairComputeFunctor { EV_FLOAT ev; - const int inum = team.league_size(); const int atoms_per_team = team.team_size(); const int firstatom = team.league_rank()*atoms_per_team; const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) { const int i = list.d_ilist[ii]; @@ -616,7 +613,6 @@ struct PairComputeFunctor { EV_FLOAT ev; - const int inum = team.league_size(); const int atoms_per_team = team.team_size(); const int firstatom = team.league_rank()*atoms_per_team; const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; @@ -926,6 +922,14 @@ int GetMaxNeighs(NeighStyle* list) return maxneigh; } +template +void GetMaxTeamSize(FunctorStyle& functor, int inum, + int &teamsize_max_for, int &teamsize_max_reduce) +{ + teamsize_max_for = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelForTag()); + teamsize_max_reduce = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelReduceTag()); +} + // Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL template EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos*> list) { @@ -939,6 +943,7 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P if (fpair->lmp->kokkos->neigh_thread) { static int vectorsize = 0; + static int atoms_per_team = 0; static int lastcall = -1; #if defined(LMP_KOKKOS_GPU) @@ -952,21 +957,39 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P if (!vectorsize || lastcall < fpair->lmp->neighbor->lastcall) { lastcall = fpair->lmp->update->ntimestep; vectorsize = GetMaxNeighs(list); - vectorsize = MathSpecial::powint(2,int(log2(vectorsize))); // round down to nearest power of 2 + vectorsize = MathSpecial::powint(2,(int(log2(vectorsize) + 0.5))); // round to nearest power of 2 vectorsize = MIN(vectorsize,max_vectorsize); + + int teamsize_max_for,teamsize_max_reduce; + if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { + PairComputeFunctor ff(fpair,list); + GetMaxTeamSize(ff, list->inum, teamsize_max_for, teamsize_max_reduce); + } else { + PairComputeFunctor ff(fpair,list); + GetMaxTeamSize(ff, list->inum, teamsize_max_for, teamsize_max_reduce); + } + + int teamsize_max = teamsize_max_for; + if (fpair->eflag || fpair->vflag) + teamsize_max = teamsize_max_reduce; + atoms_per_team = teamsize_max/vectorsize; } #else vectorsize = 1; + atoms_per_team = 1; #endif + const int inum = list->inum; + const int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0); + if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { PairComputeFunctor ff(fpair,list); - Kokkos::TeamPolicy > policy(list->inum,Kokkos::AUTO(),vectorsize); + Kokkos::TeamPolicy > policy(num_teams,atoms_per_team,vectorsize); if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev); else Kokkos::parallel_for(policy,ff); } else { PairComputeFunctor ff(fpair,list); - Kokkos::TeamPolicy > policy(list->inum,Kokkos::AUTO(),vectorsize); + Kokkos::TeamPolicy > policy(num_teams,atoms_per_team,vectorsize); if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev); else Kokkos::parallel_for(policy,ff); } From 86f87e0f7b85b8d157e790249abf31b54abb3bf3 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 19 Dec 2023 10:46:41 -0700 Subject: [PATCH 5/8] Update docs --- doc/src/package.rst | 16 ++++++++-------- src/KOKKOS/pair_kokkos.h | 31 ++++++++++++++++--------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/doc/src/package.rst b/doc/src/package.rst index 63a7f095ad..65392845e9 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -474,13 +474,13 @@ If the *neigh/thread* keyword is set to *off*, then the KOKKOS package threads only over atoms. However, for small systems, this may not expose enough parallelism to keep a GPU busy. When this keyword is set to *on*, the KOKKOS package threads over both atoms and neighbors of atoms. When -using *neigh/thread* *on*, a full neighbor list must also be used. Using -*neigh/thread* *on* may be slower for large systems, so this this option -is turned on by default only when there are 16K atoms or less owned by -an MPI rank and when using a full neighbor list. Not all KOKKOS-enabled -potentials support this keyword yet, and only thread over atoms. Many -simple pairwise potentials such as Lennard-Jones do support threading -over both atoms and neighbors. +using *neigh/thread* *on*, the :doc:`newton pair ` setting must +be "off". Using *neigh/thread* *on* may be slower for large systems, so +this this option is turned on by default only when running on one or +more GPUs and there are 16k atoms or less owned by an MPI rank. Not all +KOKKOS-enabled potentials support this keyword yet, and only thread over +atoms. Many simple pairwise potentials such as Lennard-Jones do support +threading over both atoms and neighbors. If the *neigh/transpose* keyword is set to *off*, then the KOKKOS package will use the same memory layout for building the neighbor list on @@ -732,7 +732,7 @@ comm = device, sort = device, neigh/transpose = off, gpu/aware = on. When LAMMPS can safely detect that GPU-aware MPI is not available, the default value of gpu/aware becomes "off". For CPUs or Xeon Phis, the option defaults are neigh = half, neigh/qeq = half, newton = on, binsize = 0.0, comm = no, and sort -= no. The option neigh/thread = on when there are 16K atoms or less on an MPI += no. For GPUs, option neigh/thread = on when there are 16k atoms or less on an MPI rank, otherwise it is "off". These settings are made automatically by the required "-k on" :doc:`command-line switch `. You can change them by using the package kokkos command in your input script or via the :doc:`-pk diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index 55fe01f6d2..eea2cd5316 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -935,8 +935,10 @@ template*> list) { EV_FLOAT ev; + const int inum = list->inum; + if (!fpair->lmp->kokkos->neigh_thread_set) - if (list->inum <= 16384) + if (fpair->lmp->kokkos->ngpus && inum <= 16000) if (NEIGHFLAG == FULL || !fpair->newton_pair) fpair->lmp->kokkos->neigh_thread = 1; @@ -947,26 +949,26 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P static int lastcall = -1; #if defined(LMP_KOKKOS_GPU) - - #if defined(KOKKOS_ENABLE_HIP) - int max_vectorsize = 64; - #else - int max_vectorsize = 32; - #endif - if (!vectorsize || lastcall < fpair->lmp->neighbor->lastcall) { lastcall = fpair->lmp->update->ntimestep; vectorsize = GetMaxNeighs(list); vectorsize = MathSpecial::powint(2,(int(log2(vectorsize) + 0.5))); // round to nearest power of 2 + + #if defined(KOKKOS_ENABLE_HIP) + int max_vectorsize = 64; + #else + int max_vectorsize = 32; + #endif + vectorsize = MIN(vectorsize,max_vectorsize); int teamsize_max_for,teamsize_max_reduce; if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { PairComputeFunctor ff(fpair,list); - GetMaxTeamSize(ff, list->inum, teamsize_max_for, teamsize_max_reduce); + GetMaxTeamSize(ff, inum, teamsize_max_for, teamsize_max_reduce); } else { PairComputeFunctor ff(fpair,list); - GetMaxTeamSize(ff, list->inum, teamsize_max_for, teamsize_max_reduce); + GetMaxTeamSize(ff, inum, teamsize_max_for, teamsize_max_reduce); } int teamsize_max = teamsize_max_for; @@ -979,7 +981,6 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P atoms_per_team = 1; #endif - const int inum = list->inum; const int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0); if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { @@ -996,13 +997,13 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P } else { if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { PairComputeFunctor ff(fpair,list); - if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); - else Kokkos::parallel_for(list->inum,ff); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev); + else Kokkos::parallel_for(inum,ff); ff.contribute(); } else { PairComputeFunctor ff(fpair,list); - if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); - else Kokkos::parallel_for(list->inum,ff); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev); + else Kokkos::parallel_for(inum,ff); ff.contribute(); } } From e00fc992fcbf899cefb3c409a561ed13e03b3346 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 22 Dec 2023 11:35:35 -0500 Subject: [PATCH 6/8] skip python tests using numpy that fail randomly on macOS --- unittest/python/CMakeLists.txt | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/unittest/python/CMakeLists.txt b/unittest/python/CMakeLists.txt index b4ba281d93..f3b851620c 100644 --- a/unittest/python/CMakeLists.txt +++ b/unittest/python/CMakeLists.txt @@ -84,20 +84,26 @@ if(Python_EXECUTABLE) WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) set_tests_properties(PythonCommands PROPERTIES ENVIRONMENT "${PYTHON_TEST_ENVIRONMENT}") - add_test(NAME PythonNumpy - COMMAND ${PYTHON_TEST_RUNNER} ${CMAKE_CURRENT_SOURCE_DIR}/python-numpy.py -v - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - set_tests_properties(PythonNumpy PROPERTIES ENVIRONMENT "${PYTHON_TEST_ENVIRONMENT}") + # randomly failing on macOS with python 3.12 + if(NOT APPLE) + add_test(NAME PythonNumpy + COMMAND ${PYTHON_TEST_RUNNER} ${CMAKE_CURRENT_SOURCE_DIR}/python-numpy.py -v + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + set_tests_properties(PythonNumpy PROPERTIES ENVIRONMENT "${PYTHON_TEST_ENVIRONMENT}") + endif() add_test(NAME PythonCapabilities COMMAND ${PYTHON_TEST_RUNNER} ${CMAKE_CURRENT_SOURCE_DIR}/python-capabilities.py -v WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) set_tests_properties(PythonCapabilities PROPERTIES ENVIRONMENT "${PYTHON_TEST_ENVIRONMENT}") - add_test(NAME PythonPyLammps - COMMAND ${PYTHON_TEST_RUNNER} ${CMAKE_CURRENT_SOURCE_DIR}/python-pylammps.py -v - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) - set_tests_properties(PythonPyLammps PROPERTIES ENVIRONMENT "${PYTHON_TEST_ENVIRONMENT}") + # randomly failing on macOS with python 3.12 + if(NOT APPLE) + add_test(NAME PythonPyLammps + COMMAND ${PYTHON_TEST_RUNNER} ${CMAKE_CURRENT_SOURCE_DIR}/python-pylammps.py -v + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + set_tests_properties(PythonPyLammps PROPERTIES ENVIRONMENT "${PYTHON_TEST_ENVIRONMENT}") + endif() add_test(NAME PythonFormats COMMAND ${PYTHON_TEST_RUNNER} ${CMAKE_CURRENT_SOURCE_DIR}/python-formats.py -v From 55784019f76c27ce5c3703d379439e123588530b Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Wed, 3 Jan 2024 16:46:06 -0700 Subject: [PATCH 7/8] Fix CPU issue --- src/KOKKOS/pair_kokkos.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index eea2cd5316..32f50c3bb5 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -84,8 +84,6 @@ struct PairComputeFunctor { // typename KKDevice::value,Kokkos::MemoryTraits::value> > vatom; KKScatterView dup_vatom; - - NeighListKokkos list; PairComputeFunctor(PairStyle* c_ptr, @@ -109,13 +107,15 @@ struct PairComputeFunctor { } void contribute() { - Kokkos::Experimental::contribute(c.f, dup_f); + if constexpr (std::is_same_v,Kokkos::Experimental::ScatterDuplicated>) { + Kokkos::Experimental::contribute(c.f, dup_f); - if (c.eflag_atom) - Kokkos::Experimental::contribute(c.d_eatom, dup_eatom); + if (c.eflag_atom) + Kokkos::Experimental::contribute(c.d_eatom, dup_eatom); - if (c.vflag_atom) - Kokkos::Experimental::contribute(c.d_vatom, dup_vatom); + if (c.vflag_atom) + Kokkos::Experimental::contribute(c.d_vatom, dup_vatom); + } } // Loop over neighbors of one atom without coulomb interaction @@ -988,11 +988,13 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P Kokkos::TeamPolicy > policy(num_teams,atoms_per_team,vectorsize); if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev); else Kokkos::parallel_for(policy,ff); + ff.contribute(); } else { PairComputeFunctor ff(fpair,list); Kokkos::TeamPolicy > policy(num_teams,atoms_per_team,vectorsize); if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev); else Kokkos::parallel_for(policy,ff); + ff.contribute(); } } else { if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { From 9d7582ec1bb86d0b41f9ae6f1de9e62d81e9db01 Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Wed, 3 Jan 2024 16:59:11 -0700 Subject: [PATCH 8/8] Small tweak for readability --- src/KOKKOS/pair_kokkos.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index 32f50c3bb5..9521268284 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -107,7 +107,9 @@ struct PairComputeFunctor { } void contribute() { - if constexpr (std::is_same_v,Kokkos::Experimental::ScatterDuplicated>) { + int need_dup = std::is_same_v; + + if (need_dup) { Kokkos::Experimental::contribute(c.f, dup_f); if (c.eflag_atom)