Merge pull request #4003 from stanmoore1/kk_half_thread

Thread over neighbors in addition to atoms when using a half neighlist
Axel Kohlmeyer (committed via GitHub), 2024-01-04 21:04:31 -05:00
3 changed files with 329 additions and 135 deletions

doc/src/package.rst

@@ -472,13 +472,13 @@ If the *neigh/thread* keyword is set to *off*, then the KOKKOS package
 threads only over atoms. However, for small systems, this may not expose
 enough parallelism to keep a GPU busy. When this keyword is set to *on*,
 the KOKKOS package threads over both atoms and neighbors of atoms. When
-using *neigh/thread* *on*, a full neighbor list must also be used. Using
-*neigh/thread* *on* may be slower for large systems, so this this option
-is turned on by default only when there are 16K atoms or less owned by
-an MPI rank and when using a full neighbor list. Not all KOKKOS-enabled
-potentials support this keyword yet, and only thread over atoms. Many
-simple pairwise potentials such as Lennard-Jones do support threading
-over both atoms and neighbors.
+using *neigh/thread* *on*, the :doc:`newton pair <newton>` setting must
+be "off". Using *neigh/thread* *on* may be slower for large systems, so
+this option is turned on by default only when running on one or
+more GPUs and there are 16k atoms or less owned by an MPI rank. Not all
+KOKKOS-enabled potentials support this keyword yet, and only thread over
+atoms. Many simple pairwise potentials such as Lennard-Jones do support
+threading over both atoms and neighbors.

 If the *neigh/transpose* keyword is set to *off*, then the KOKKOS
 package will use the same memory layout for building the neighbor list on

@@ -730,7 +730,7 @@ comm = device, sort = device, neigh/transpose = off, gpu/aware = on. When
 LAMMPS can safely detect that GPU-aware MPI is not available, the default value
 of gpu/aware becomes "off". For CPUs or Xeon Phis, the option defaults are
 neigh = half, neigh/qeq = half, newton = on, binsize = 0.0, comm = no, and sort
-= no. The option neigh/thread = on when there are 16K atoms or less on an MPI
+= no. For GPUs, option neigh/thread = on when there are 16k atoms or less on an MPI
 rank, otherwise it is "off". These settings are made automatically by the
 required "-k on" :doc:`command-line switch <Run_options>`. You can change them
 by using the package kokkos command in your input script or via the :doc:`-pk
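
As an illustration of these defaults (not an exact excerpt from the manual), a GPU run that wants neighbor threading together with the now-required newton setting would use something along the lines of ``package kokkos newton off neigh/thread on`` in the input script, or equivalently ``-pk kokkos newton off neigh/thread on`` combined with the ``-k on ... -sf kk`` command-line switches; the keywords themselves are the ones documented in the hunk above.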

src/KOKKOS/kokkos.cpp

@@ -608,8 +608,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
     force->newton = force->newton_pair = force->newton_bond = newtonflag;

-  if (neigh_thread && neighflag != FULL)
-    error->all(FLERR,"Must use KOKKOS package option 'neigh full' with 'neigh/thread on'");
+  if (neigh_thread && newtonflag)
+    error->all(FLERR,"Must use KOKKOS package option 'newton off' with 'neigh/thread on'");

   neighbor->binsize_user = binsize;
   if (binsize <= 0.0) neighbor->binsizeflag = 0;

src/KOKKOS/pair_kokkos.h

@@ -19,10 +19,12 @@
 #ifndef LMP_PAIR_KOKKOS_H
 #define LMP_PAIR_KOKKOS_H

-#include "Kokkos_Macros.hpp"
 #include "pair.h" // IWYU pragma: export
 #include "neighbor_kokkos.h"
 #include "neigh_list_kokkos.h"
+#include "math_special.h"
+#include "update.h"
+#include "Kokkos_Macros.hpp"
 #include "Kokkos_ScatterView.hpp"

 namespace LAMMPS_NS {
@@ -63,6 +65,7 @@ struct PairComputeFunctor {
   typename AT::t_f_array f;
   typename AT::t_efloat_1d d_eatom;
   typename AT::t_virial_array d_vatom;
+  int inum;

   using KKDeviceType = typename KKDevice<device_type>::value;
   using DUP = NeedDup_v<NEIGHFLAG,device_type>;
@@ -81,8 +84,6 @@ struct PairComputeFunctor {
   // typename KKDevice<device_type>::value,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > vatom;
   KKScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,KKDeviceType,KKScatterSum,DUP> dup_vatom;

   NeighListKokkos<device_type> list;

   PairComputeFunctor(PairStyle* c_ptr,
@@ -95,6 +96,7 @@ struct PairComputeFunctor {
     dup_f = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.f);
     dup_eatom = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.d_eatom);
     dup_vatom = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.d_vatom);
+    inum = list.inum;
   };

   // Set copymode = 1 so parent allocations aren't destructed by copies of the style
@@ -105,17 +107,22 @@ struct PairComputeFunctor {
   }

   void contribute() {
-    Kokkos::Experimental::contribute(c.f, dup_f);
+    int need_dup = std::is_same_v<DUP,Kokkos::Experimental::ScatterDuplicated>;
+    if (need_dup) {
+      Kokkos::Experimental::contribute(c.f, dup_f);

-    if (c.eflag_atom)
-      Kokkos::Experimental::contribute(c.d_eatom, dup_eatom);
+      if (c.eflag_atom)
+        Kokkos::Experimental::contribute(c.d_eatom, dup_eatom);

-    if (c.vflag_atom)
-      Kokkos::Experimental::contribute(c.d_vatom, dup_vatom);
+      if (c.vflag_atom)
+        Kokkos::Experimental::contribute(c.d_vatom, dup_vatom);
+    }
   }

   // Loop over neighbors of one atom without coulomb interaction
   // This function is called in parallel
   template<int EVFLAG, int NEWTON_PAIR>
   KOKKOS_FUNCTION
   EV_FLOAT compute_item(const int& ii,
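
The need_dup guard above only matters because ScatterView has two data layouts. A minimal sketch of that pattern (not LAMMPS code, all names illustrative):

    #include <Kokkos_Core.hpp>
    #include <Kokkos_ScatterView.hpp>

    // On threaded host backends the ScatterView keeps one copy of the target per
    // thread and contribute() folds the copies back together; on GPU backends the
    // non-duplicated layout writes through atomics, so there is nothing to reduce
    // and the guard above simply skips the call.
    void scatter_demo(Kokkos::View<double*> f, Kokkos::View<int*> owner, int npairs) {
      auto dup_f = Kokkos::Experimental::create_scatter_view(f);   // duplicated or atomic, backend-dependent
      Kokkos::parallel_for("accumulate", npairs, KOKKOS_LAMBDA(const int p) {
        auto a_f = dup_f.access();        // race-free accumulation handle
        a_f(owner(p)) += 1.0;             // many p may target the same owner(p)
      });
      Kokkos::Experimental::contribute(f, dup_f);  // real work only for the duplicated layout
    }
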
@@ -161,7 +168,7 @@ struct PairComputeFunctor {
       fytmp += dely*fpair;
       fztmp += delz*fpair;

-      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
+      if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
         a_f(j,0) -= delx*fpair;
         a_f(j,1) -= dely*fpair;
         a_f(j,2) -= delz*fpair;
@@ -169,9 +176,9 @@ struct PairComputeFunctor {
       if (EVFLAG) {
         F_FLOAT evdwl = 0.0;
-        if (c.eflag) {
+        if (c.eflag_either) {
           evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
-          ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*evdwl;
+          ev.evdwl += (((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*evdwl;
         }
         if (c.vflag_either || c.eflag_atom) ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
@@ -189,6 +196,7 @@ struct PairComputeFunctor {
   // Loop over neighbors of one atom with coulomb interaction
   // This function is called in parallel
   template<int EVFLAG, int NEWTON_PAIR>
   KOKKOS_FUNCTION
   EV_FLOAT compute_item(const int& ii,
@@ -241,7 +249,7 @@ struct PairComputeFunctor {
       fytmp += dely*fpair;
       fztmp += delz*fpair;

-      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
+      if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
         a_f(j,0) -= delx*fpair;
         a_f(j,1) -= dely*fpair;
         a_f(j,2) -= delz*fpair;
@@ -250,14 +258,14 @@ struct PairComputeFunctor {
       if (EVFLAG) {
         F_FLOAT evdwl = 0.0;
         F_FLOAT ecoul = 0.0;
-        if (c.eflag) {
+        if (c.eflag_either) {
           if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
             evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
-            ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*evdwl;
+            ev.evdwl += (((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || (j < c.nlocal)))?1.0:0.5)*evdwl;
           }
           if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
             ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
-            ev.ecoul += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*ecoul;
+            ev.ecoul += (((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || (j < c.nlocal)))?1.0:0.5)*ecoul;
           }
         }
@@ -273,14 +281,16 @@ struct PairComputeFunctor {
     return ev;
   }

-  // Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
+  // TeamPolicy, newton off, and no energy/virial
   // Loop over neighbors of one atom without coulomb interaction
   // This function is called in parallel
   KOKKOS_FUNCTION
   void compute_item_team(typename Kokkos::TeamPolicy<device_type>::member_type team,
                          const NeighListKokkos<device_type> &list, const NoCoulTag&) const {

-    const int inum = team.league_size();
+    auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
+
     const int atoms_per_team = team.team_size();
     const int firstatom = team.league_rank()*atoms_per_team;
     const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
@@ -292,7 +302,7 @@ struct PairComputeFunctor {
       const X_FLOAT ztmp = c.x(i,2);
       const int itype = c.type(i);

-      if (ZEROFLAG) {
+      if (NEIGHFLAG == FULL && ZEROFLAG) {
         Kokkos::single(Kokkos::PerThread(team), [&] (){
           f(i,0) = 0.0;
           f(i,1) = 0.0;
@@ -321,30 +331,42 @@ struct PairComputeFunctor {
           const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);

-          ftmp.x += delx*fpair;
-          ftmp.y += dely*fpair;
-          ftmp.z += delz*fpair;
+          const F_FLOAT fx = delx*fpair;
+          const F_FLOAT fy = dely*fpair;
+          const F_FLOAT fz = delz*fpair;
+
+          ftmp.x += fx;
+          ftmp.y += fy;
+          ftmp.z += fz;
+
+          if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
+            a_f(j,0) -= fx;
+            a_f(j,1) -= fy;
+            a_f(j,2) -= fz;
+          }
         }
       },fsum);

       Kokkos::single(Kokkos::PerThread(team), [&] () {
-        f(i,0) += fsum.x;
-        f(i,1) += fsum.y;
-        f(i,2) += fsum.z;
+        a_f(i,0) += fsum.x;
+        a_f(i,1) += fsum.y;
+        a_f(i,2) += fsum.z;
       });
     });
   }

-  // Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
+  // TeamPolicy, newton off, and no energy/virial
   // Loop over neighbors of one atom with coulomb interaction
   // This function is called in parallel
   KOKKOS_FUNCTION
   void compute_item_team(typename Kokkos::TeamPolicy<device_type>::member_type team,
                          const NeighListKokkos<device_type> &list, const CoulTag& ) const {

-    const int inum = team.league_size();
+    auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
+
     const int atoms_per_team = team.team_size();
     int firstatom = team.league_rank()*atoms_per_team;
     int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
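
For readers less familiar with Kokkos hierarchical parallelism, the shape of compute_item_team can be reduced to the following sketch (not the LAMMPS functor; it gives each team a single atom instead of a block of atoms, and all names are illustrative): the team cooperatively reduces the pair contributions over the atom's neighbors, and a single thread commits the sum, which has to be done atomically once half lists let neighboring atoms update each other's forces.

    #include <Kokkos_Core.hpp>

    using team_member = Kokkos::TeamPolicy<>::member_type;

    // pair_contrib(i,jj): force contribution on atom i from its jj-th neighbor (illustrative)
    void per_atom_force(Kokkos::View<double**> pair_contrib, Kokkos::View<int*> numneigh,
                        Kokkos::View<double*> f, int inum) {
      Kokkos::TeamPolicy<> policy(inum, Kokkos::AUTO);
      Kokkos::parallel_for("force_i", policy, KOKKOS_LAMBDA(const team_member& team) {
        const int i = team.league_rank();
        double fsum = 0.0;
        // the team's threads cooperatively reduce over the neighbors of atom i
        Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numneigh(i)),
          [&](const int jj, double& acc) { acc += pair_contrib(i, jj); }, fsum);
        Kokkos::single(Kokkos::PerTeam(team), [&]() {
          Kokkos::atomic_add(&f(i), fsum);   // atoms also receive updates from their neighbors' teams
        });
      });
    }
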
@@ -357,8 +379,9 @@ struct PairComputeFunctor {
       const int itype = c.type(i);
       const F_FLOAT qtmp = c.q(i);

-      if (ZEROFLAG) {
-        Kokkos::single(Kokkos::PerThread(team), [&] (){
+      if (NEIGHFLAG == FULL && ZEROFLAG) {
+        Kokkos::single(Kokkos::PerThread(team), [&] ()
+        {
           f(i,0) = 0.0;
           f(i,1) = 0.0;
           f(i,2) = 0.0;
@@ -391,31 +414,45 @@ struct PairComputeFunctor {
           if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
             fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);

-          ftmp.x += delx*fpair;
-          ftmp.y += dely*fpair;
-          ftmp.z += delz*fpair;
+          const F_FLOAT fx = delx*fpair;
+          const F_FLOAT fy = dely*fpair;
+          const F_FLOAT fz = delz*fpair;
+
+          ftmp.x += fx;
+          ftmp.y += fy;
+          ftmp.z += fz;
+
+          if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
+            a_f(j,0) -= fx;
+            a_f(j,1) -= fy;
+            a_f(j,2) -= fz;
+          }
         }
       },fsum);

       Kokkos::single(Kokkos::PerThread(team), [&] () {
-        f(i,0) += fsum.x;
-        f(i,1) += fsum.y;
-        f(i,2) += fsum.z;
+        a_f(i,0) += fsum.x;
+        a_f(i,1) += fsum.y;
+        a_f(i,2) += fsum.z;
       });
     });
   }

-  // Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
+  // TeamPolicy, newton off, and energy/virial
   // Loop over neighbors of one atom without coulomb interaction
   // This function is called in parallel
   KOKKOS_FUNCTION
   EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy<device_type>::member_type team,
                                 const NeighListKokkos<device_type> &list, const NoCoulTag&) const {

+    auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
+    auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
+    auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
+
     EV_FLOAT ev;

-    const int inum = team.league_size();
     const int atoms_per_team = team.team_size();
     const int firstatom = team.league_rank()*atoms_per_team;
     const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
@@ -427,8 +464,9 @@ struct PairComputeFunctor {
       const X_FLOAT ztmp = c.x(i,2);
       const int itype = c.type(i);

-      if (ZEROFLAG) {
-        Kokkos::single(Kokkos::PerThread(team), [&] (){
+      if (NEIGHFLAG == FULL && ZEROFLAG) {
+        Kokkos::single(Kokkos::PerThread(team), [&] ()
+        {
           f(i,0) = 0.0;
           f(i,1) = 0.0;
           f(i,2) = 0.0;
@@ -456,37 +494,85 @@ struct PairComputeFunctor {
           const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);

-          fev_tmp.f[0] += delx*fpair;
-          fev_tmp.f[1] += dely*fpair;
-          fev_tmp.f[2] += delz*fpair;
+          const F_FLOAT fx = delx*fpair;
+          const F_FLOAT fy = dely*fpair;
+          const F_FLOAT fz = delz*fpair;
+
+          fev_tmp.f[0] += fx;
+          fev_tmp.f[1] += fy;
+          fev_tmp.f[2] += fz;
+
+          const int I_CONTRIB = (NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD);
+          const int J_CONTRIB = ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal);
+          const E_FLOAT factor = J_CONTRIB?1.0:0.5;
+
+          if (J_CONTRIB) {
+            a_f(j,0) -= fx;
+            a_f(j,1) -= fy;
+            a_f(j,2) -= fz;
+          }
+
           F_FLOAT evdwl = 0.0;
-          if (c.eflag) {
+          if (c.eflag_either) {
             evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
-            fev_tmp.evdwl += 0.5*evdwl;
+            fev_tmp.evdwl += factor * evdwl;
+            if (c.eflag_atom) {
+              const E_FLOAT epairhalf = 0.5 * evdwl;
+              if (I_CONTRIB)
+                a_eatom[i] += epairhalf;
+              if (J_CONTRIB)
+                a_eatom[j] += epairhalf;
+            }
           }
           if (c.vflag_either) {
-            fev_tmp.v[0] += 0.5*delx*delx*fpair;
-            fev_tmp.v[1] += 0.5*dely*dely*fpair;
-            fev_tmp.v[2] += 0.5*delz*delz*fpair;
-            fev_tmp.v[3] += 0.5*delx*dely*fpair;
-            fev_tmp.v[4] += 0.5*delx*delz*fpair;
-            fev_tmp.v[5] += 0.5*dely*delz*fpair;
+            const E_FLOAT v0 = delx*delx*fpair;
+            const E_FLOAT v1 = dely*dely*fpair;
+            const E_FLOAT v2 = delz*delz*fpair;
+            const E_FLOAT v3 = delx*dely*fpair;
+            const E_FLOAT v4 = delx*delz*fpair;
+            const E_FLOAT v5 = dely*delz*fpair;
+
+            fev_tmp.v[0] += factor*v0;
+            fev_tmp.v[1] += factor*v1;
+            fev_tmp.v[2] += factor*v2;
+            fev_tmp.v[3] += factor*v3;
+            fev_tmp.v[4] += factor*v4;
+            fev_tmp.v[5] += factor*v5;
+
+            if (c.vflag_atom) {
+              if (I_CONTRIB) {
+                a_vatom(i,0) += 0.5*v0;
+                a_vatom(i,1) += 0.5*v1;
+                a_vatom(i,2) += 0.5*v2;
+                a_vatom(i,3) += 0.5*v3;
+                a_vatom(i,4) += 0.5*v4;
+                a_vatom(i,5) += 0.5*v5;
+              }
+              if (J_CONTRIB) {
+                a_vatom(j,0) += 0.5*v0;
+                a_vatom(j,1) += 0.5*v1;
+                a_vatom(j,2) += 0.5*v2;
+                a_vatom(j,3) += 0.5*v3;
+                a_vatom(j,4) += 0.5*v4;
+                a_vatom(j,5) += 0.5*v5;
+              }
+            }
           }
         }
       },fev);

       Kokkos::single(Kokkos::PerThread(team), [&] () {
-        f(i,0) += fev.f[0];
-        f(i,1) += fev.f[1];
-        f(i,2) += fev.f[2];
+        a_f(i,0) += fev.f[0];
+        a_f(i,1) += fev.f[1];
+        a_f(i,2) += fev.f[2];

         if (c.eflag_global)
           ev.evdwl += fev.evdwl;

-        if (c.eflag_atom)
-          d_eatom(i) += fev.evdwl;
-
         if (c.vflag_global) {
           ev.v[0] += fev.v[0];
           ev.v[1] += fev.v[1];
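
The I_CONTRIB/J_CONTRIB/factor logic introduced here encodes the usual pair tally convention, which can be summarized in a small hedged helper (illustrative only, not a LAMMPS API):

    // A full list visits each i-j pair twice, so every visit tallies half of the
    // pair energy/virial; a half list visits the pair once and may tally the full
    // amount, but only when atom j is locally owned (j < nlocal), otherwise the
    // missing half is accumulated by the rank that owns j.
    inline double pair_tally_weight(bool half_list, bool j_is_local) {
      if (!half_list) return 0.5;          // FULL neighbor list
      return j_is_local ? 1.0 : 0.5;       // HALF/HALFTHREAD with newton off
    }
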
@@ -496,29 +582,39 @@ struct PairComputeFunctor {
           ev.v[5] += fev.v[5];
         }

-        if (c.vflag_atom) {
-          d_vatom(i,0) += fev.v[0];
-          d_vatom(i,1) += fev.v[1];
-          d_vatom(i,2) += fev.v[2];
-          d_vatom(i,3) += fev.v[3];
-          d_vatom(i,4) += fev.v[4];
-          d_vatom(i,5) += fev.v[5];
+        if (NEIGHFLAG == FULL) {
+
+          if (c.eflag_atom)
+            a_eatom(i) += fev.evdwl;
+
+          if (c.vflag_atom) {
+            a_vatom(i,0) += fev.v[0];
+            a_vatom(i,1) += fev.v[1];
+            a_vatom(i,2) += fev.v[2];
+            a_vatom(i,3) += fev.v[3];
+            a_vatom(i,4) += fev.v[4];
+            a_vatom(i,5) += fev.v[5];
+          }
         }
       });
     });

     return ev;
   }

-  // Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
+  // TeamPolicy, newton off, and energy/virial
   // Loop over neighbors of one atom with coulomb interaction
   // This function is called in parallel
   KOKKOS_FUNCTION
   EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy<device_type>::member_type team,
                                 const NeighListKokkos<device_type> &list, const CoulTag& ) const {

+    auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
+    auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
+    auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
+
     EV_FLOAT ev;

-    const int inum = team.league_size();
     const int atoms_per_team = team.team_size();
     const int firstatom = team.league_rank()*atoms_per_team;
     const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
@@ -566,45 +662,92 @@ struct PairComputeFunctor {
           if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
             fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);

-          fev_tmp.f[0] += delx*fpair;
-          fev_tmp.f[1] += dely*fpair;
-          fev_tmp.f[2] += delz*fpair;
+          const F_FLOAT fx = delx*fpair;
+          const F_FLOAT fy = dely*fpair;
+          const F_FLOAT fz = delz*fpair;
+
+          fev_tmp.f[0] += fx;
+          fev_tmp.f[1] += fy;
+          fev_tmp.f[2] += fz;
+
+          const int I_CONTRIB = (NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD);
+          const int J_CONTRIB = ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal);
+          const E_FLOAT factor = J_CONTRIB?1.0:0.5;
+
+          if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
+            a_f(j,0) -= fx;
+            a_f(j,1) -= fy;
+            a_f(j,2) -= fz;
+          }
+
           F_FLOAT evdwl = 0.0;
           F_FLOAT ecoul = 0.0;
-          if (c.eflag) {
+          if (c.eflag_either) {
             if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
               evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
-              fev_tmp.evdwl += 0.5*evdwl;
+              fev_tmp.evdwl += factor * evdwl;
             }
             if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
               ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
-              fev_tmp.ecoul += 0.5*ecoul;
+              fev_tmp.ecoul += factor * ecoul;
+            }
+            if (c.eflag_atom) {
+              const E_FLOAT epairhalf = 0.5 * (evdwl + ecoul);
+              if (I_CONTRIB)
+                a_eatom[i] += epairhalf;
+              if (J_CONTRIB)
+                a_eatom[j] += epairhalf;
             }
           }
           if (c.vflag_either) {
-            fev_tmp.v[0] += 0.5*delx*delx*fpair;
-            fev_tmp.v[1] += 0.5*dely*dely*fpair;
-            fev_tmp.v[2] += 0.5*delz*delz*fpair;
-            fev_tmp.v[3] += 0.5*delx*dely*fpair;
-            fev_tmp.v[4] += 0.5*delx*delz*fpair;
-            fev_tmp.v[5] += 0.5*dely*delz*fpair;
+            const E_FLOAT v0 = delx*delx*fpair;
+            const E_FLOAT v1 = dely*dely*fpair;
+            const E_FLOAT v2 = delz*delz*fpair;
+            const E_FLOAT v3 = delx*dely*fpair;
+            const E_FLOAT v4 = delx*delz*fpair;
+            const E_FLOAT v5 = dely*delz*fpair;
+
+            fev_tmp.v[0] += factor*v0;
+            fev_tmp.v[1] += factor*v1;
+            fev_tmp.v[2] += factor*v2;
+            fev_tmp.v[3] += factor*v3;
+            fev_tmp.v[4] += factor*v4;
+            fev_tmp.v[5] += factor*v5;
+
+            if (c.vflag_atom) {
+              if (I_CONTRIB) {
+                a_vatom(i,0) += 0.5*v0;
+                a_vatom(i,1) += 0.5*v1;
+                a_vatom(i,2) += 0.5*v2;
+                a_vatom(i,3) += 0.5*v3;
+                a_vatom(i,4) += 0.5*v4;
+                a_vatom(i,5) += 0.5*v5;
+              }
+              if (J_CONTRIB) {
+                a_vatom(j,0) += 0.5*v0;
+                a_vatom(j,1) += 0.5*v1;
+                a_vatom(j,2) += 0.5*v2;
+                a_vatom(j,3) += 0.5*v3;
+                a_vatom(j,4) += 0.5*v4;
+                a_vatom(j,5) += 0.5*v5;
+              }
+            }
           }
         }
       },fev);

       Kokkos::single(Kokkos::PerThread(team), [&] () {
-        f(i,0) += fev.f[0];
-        f(i,1) += fev.f[1];
-        f(i,2) += fev.f[2];
+        a_f(i,0) += fev.f[0];
+        a_f(i,1) += fev.f[1];
+        a_f(i,2) += fev.f[2];

-        if (c.eflag_global) {
+        if (c.eflag_global)
           ev.evdwl += fev.evdwl;
-          ev.ecoul += fev.ecoul;
-        }
-
-        if (c.eflag_atom)
-          d_eatom(i) += fev.evdwl + fev.ecoul;

         if (c.vflag_global) {
           ev.v[0] += fev.v[0];
@@ -615,13 +758,19 @@ struct PairComputeFunctor {
           ev.v[5] += fev.v[5];
         }

-        if (c.vflag_atom) {
-          d_vatom(i,0) += fev.v[0];
-          d_vatom(i,1) += fev.v[1];
-          d_vatom(i,2) += fev.v[2];
-          d_vatom(i,3) += fev.v[3];
-          d_vatom(i,4) += fev.v[4];
-          d_vatom(i,5) += fev.v[5];
+        if (NEIGHFLAG == FULL) {
+
+          if (c.eflag_atom)
+            a_eatom(i) += fev.evdwl;
+
+          if (c.vflag_atom) {
+            a_vatom(i,0) += fev.v[0];
+            a_vatom(i,1) += fev.v[1];
+            a_vatom(i,2) += fev.v[2];
+            a_vatom(i,3) += fev.v[3];
+            a_vatom(i,4) += fev.v[4];
+            a_vatom(i,5) += fev.v[5];
+          }
         }
       });
     });
@@ -636,7 +785,7 @@ struct PairComputeFunctor {
     auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
     auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();

-    const int EFLAG = c.eflag;
+    const int EFLAG = c.eflag_either;
     const int NEWTON_PAIR = c.newton_pair;
     const int VFLAG = c.vflag_either;
@@ -657,7 +806,7 @@ struct PairComputeFunctor {
       const E_FLOAT v5 = dely*delz*fpair;

       if (c.vflag_global) {
-        if (NEIGHFLAG!=FULL) {
+        if (NEIGHFLAG != FULL) {
           if (NEWTON_PAIR) {
             ev.v[0] += v0;
             ev.v[1] += v1;
@@ -747,7 +896,8 @@ struct PairComputeFunctor {
 // This uses the fact that failure to match template parameters is not an error.
 // By having the enable_if with a ! and without it, exactly one of the functions
 // pair_compute_neighlist will match - either the dummy version
-// or the real one further below.
+// or the real one further below
+
 template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisation = void>
 EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<!((NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0), NeighListKokkos<typename PairStyle::device_type>*> list) {
   EV_FLOAT ev;
@@ -757,24 +907,29 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<!((NEIGHFLAG
   return ev;
 }

+template<class NeighStyle>
+int GetMaxNeighs(NeighStyle* list)
+{
+  auto d_ilist = list->d_ilist;
+  auto d_numneigh = list->d_numneigh;
+  int inum = list->inum;
+
+  int maxneigh = 0;
+  Kokkos::parallel_reduce(inum, LAMMPS_LAMBDA(const int ii, int &maxneigh) {
+    const int i = d_ilist[ii];
+    const int num_neighs = d_numneigh[i];
+    maxneigh = MAX(maxneigh,num_neighs);
+  }, Kokkos::Max<int>(maxneigh));
+
+  return maxneigh;
+}
+
 template<class DeviceType, class FunctorStyle>
-int GetTeamSize(FunctorStyle& KOKKOS_GPU_ARG(functor), int KOKKOS_GPU_ARG(inum),
-                int KOKKOS_GPU_ARG(reduce_flag), int team_size, int KOKKOS_GPU_ARG(vector_length)) {
-
-#ifdef LMP_KOKKOS_GPU
-  int team_size_max;
-  if (reduce_flag)
-    team_size_max = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelReduceTag());
-  else
-    team_size_max = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelForTag());
-  if (team_size*vector_length > team_size_max)
-    team_size = team_size_max/vector_length;
-#else
-  team_size = 1;
-#endif
-  return team_size;
+void GetMaxTeamSize(FunctorStyle& functor, int inum,
+                    int &teamsize_max_for, int &teamsize_max_reduce)
+{
+  teamsize_max_for = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelForTag());
+  teamsize_max_reduce = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelReduceTag());
 }

 // Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL
@@ -782,38 +937,77 @@ template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisa
 EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*> list) {
   EV_FLOAT ev;

+  const int inum = list->inum;
+
   if (!fpair->lmp->kokkos->neigh_thread_set)
-    if (list->inum <= 16384 && NEIGHFLAG == FULL)
-      fpair->lmp->kokkos->neigh_thread = 1;
+    if (fpair->lmp->kokkos->ngpus && inum <= 16000)
+      if (NEIGHFLAG == FULL || !fpair->newton_pair)
+        fpair->lmp->kokkos->neigh_thread = 1;

   if (fpair->lmp->kokkos->neigh_thread) {
-    int vector_length = 8;
-    int atoms_per_team = 32;
+    static int vectorsize = 0;
+    static int atoms_per_team = 0;
+    static int lastcall = -1;
+
+#if defined(LMP_KOKKOS_GPU)
+    if (!vectorsize || lastcall < fpair->lmp->neighbor->lastcall) {
+      lastcall = fpair->lmp->update->ntimestep;
+      vectorsize = GetMaxNeighs(list);
+      vectorsize = MathSpecial::powint(2,(int(log2(vectorsize) + 0.5))); // round to nearest power of 2
+
+#if defined(KOKKOS_ENABLE_HIP)
+      int max_vectorsize = 64;
+#else
+      int max_vectorsize = 32;
+#endif
+      vectorsize = MIN(vectorsize,max_vectorsize);
+
+      int teamsize_max_for,teamsize_max_reduce;
+      if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
+        PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
+        GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
+      } else {
+        PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
+        GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
+      }
+
+      int teamsize_max = teamsize_max_for;
+      if (fpair->eflag || fpair->vflag)
+        teamsize_max = teamsize_max_reduce;
+      atoms_per_team = teamsize_max/vectorsize;
+    }
+#else
+    vectorsize = 1;
+    atoms_per_team = 1;
+#endif
+
+    const int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);

     if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
       PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
-      atoms_per_team = GetTeamSize<typename PairStyle::device_type>(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length);
-      Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
+      Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(num_teams,atoms_per_team,vectorsize);
       if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
       else Kokkos::parallel_for(policy,ff);
+      ff.contribute();
     } else {
       PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
-      atoms_per_team = GetTeamSize<typename PairStyle::device_type>(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length);
-      Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
+      Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(num_teams,atoms_per_team,vectorsize);
       if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
       else Kokkos::parallel_for(policy,ff);
+      ff.contribute();
     }
   } else {
     if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
       PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
-      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
-      else Kokkos::parallel_for(list->inum,ff);
+      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev);
+      else Kokkos::parallel_for(inum,ff);
       ff.contribute();
     } else {
       PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
-      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
-      else Kokkos::parallel_for(list->inum,ff);
+      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev);
+      else Kokkos::parallel_for(inum,ff);
       ff.contribute();
     }
   }
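
Taken together, GetMaxNeighs, GetMaxTeamSize, and the launch code above choose the kernel shape from the neighbor statistics. A standalone sketch of just that arithmetic (plain C++, all numbers assumed for illustration):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
      const int inum = 16000;          // atoms owned by this MPI rank (assumed)
      const int maxneigh = 37;         // largest per-atom neighbor count (assumed)
      const int teamsize_max = 256;    // what TeamPolicy::team_size_max() might report (assumed)
      const int hw_width = 32;         // warp width; 64 for HIP wavefronts

      // round the neighbor count to the nearest power of two and cap it at the hardware width
      int vectorsize = 1 << (int)std::lround(std::log2((double)maxneigh));   // 37 -> 32
      vectorsize = std::min(vectorsize, hw_width);

      const int atoms_per_team = teamsize_max / vectorsize;                            // 256/32 = 8
      const int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);   // ceil(16000/8)

      std::printf("vector length %d, atoms per team %d, teams %d\n",
                  vectorsize, atoms_per_team, num_teams);
      return 0;
    }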