Merge pull request #4003 from stanmoore1/kk_half_thread
Thread over neighbors in addition to atoms when using a half neighlist
This commit is contained in:
@ -472,13 +472,13 @@ If the *neigh/thread* keyword is set to *off*, then the KOKKOS package
|
|||||||
threads only over atoms. However, for small systems, this may not expose
|
threads only over atoms. However, for small systems, this may not expose
|
||||||
enough parallelism to keep a GPU busy. When this keyword is set to *on*,
|
enough parallelism to keep a GPU busy. When this keyword is set to *on*,
|
||||||
the KOKKOS package threads over both atoms and neighbors of atoms. When
|
the KOKKOS package threads over both atoms and neighbors of atoms. When
|
||||||
using *neigh/thread* *on*, a full neighbor list must also be used. Using
|
using *neigh/thread* *on*, the :doc:`newton pair <newton>` setting must
|
||||||
*neigh/thread* *on* may be slower for large systems, so this this option
|
be "off". Using *neigh/thread* *on* may be slower for large systems, so
|
||||||
is turned on by default only when there are 16K atoms or less owned by
|
this this option is turned on by default only when running on one or
|
||||||
an MPI rank and when using a full neighbor list. Not all KOKKOS-enabled
|
more GPUs and there are 16k atoms or less owned by an MPI rank. Not all
|
||||||
potentials support this keyword yet, and only thread over atoms. Many
|
KOKKOS-enabled potentials support this keyword yet, and only thread over
|
||||||
simple pairwise potentials such as Lennard-Jones do support threading
|
atoms. Many simple pairwise potentials such as Lennard-Jones do support
|
||||||
over both atoms and neighbors.
|
threading over both atoms and neighbors.
|
||||||
|
|
||||||
If the *neigh/transpose* keyword is set to *off*, then the KOKKOS
|
If the *neigh/transpose* keyword is set to *off*, then the KOKKOS
|
||||||
package will use the same memory layout for building the neighbor list on
|
package will use the same memory layout for building the neighbor list on
|
||||||
@ -730,7 +730,7 @@ comm = device, sort = device, neigh/transpose = off, gpu/aware = on. When
|
|||||||
LAMMPS can safely detect that GPU-aware MPI is not available, the default value
|
LAMMPS can safely detect that GPU-aware MPI is not available, the default value
|
||||||
of gpu/aware becomes "off". For CPUs or Xeon Phis, the option defaults are
|
of gpu/aware becomes "off". For CPUs or Xeon Phis, the option defaults are
|
||||||
neigh = half, neigh/qeq = half, newton = on, binsize = 0.0, comm = no, and sort
|
neigh = half, neigh/qeq = half, newton = on, binsize = 0.0, comm = no, and sort
|
||||||
= no. The option neigh/thread = on when there are 16K atoms or less on an MPI
|
= no. For GPUs, option neigh/thread = on when there are 16k atoms or less on an MPI
|
||||||
rank, otherwise it is "off". These settings are made automatically by the
|
rank, otherwise it is "off". These settings are made automatically by the
|
||||||
required "-k on" :doc:`command-line switch <Run_options>`. You can change them
|
required "-k on" :doc:`command-line switch <Run_options>`. You can change them
|
||||||
by using the package kokkos command in your input script or via the :doc:`-pk
|
by using the package kokkos command in your input script or via the :doc:`-pk
|
||||||
|
|||||||
@ -608,8 +608,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
|
|||||||
|
|
||||||
force->newton = force->newton_pair = force->newton_bond = newtonflag;
|
force->newton = force->newton_pair = force->newton_bond = newtonflag;
|
||||||
|
|
||||||
if (neigh_thread && neighflag != FULL)
|
if (neigh_thread && newtonflag)
|
||||||
error->all(FLERR,"Must use KOKKOS package option 'neigh full' with 'neigh/thread on'");
|
error->all(FLERR,"Must use KOKKOS package option 'newton off' with 'neigh/thread on'");
|
||||||
|
|
||||||
neighbor->binsize_user = binsize;
|
neighbor->binsize_user = binsize;
|
||||||
if (binsize <= 0.0) neighbor->binsizeflag = 0;
|
if (binsize <= 0.0) neighbor->binsizeflag = 0;
|
||||||
|
|||||||
@ -19,10 +19,12 @@
|
|||||||
#ifndef LMP_PAIR_KOKKOS_H
|
#ifndef LMP_PAIR_KOKKOS_H
|
||||||
#define LMP_PAIR_KOKKOS_H
|
#define LMP_PAIR_KOKKOS_H
|
||||||
|
|
||||||
#include "Kokkos_Macros.hpp"
|
|
||||||
#include "pair.h" // IWYU pragma: export
|
#include "pair.h" // IWYU pragma: export
|
||||||
#include "neighbor_kokkos.h"
|
#include "neighbor_kokkos.h"
|
||||||
#include "neigh_list_kokkos.h"
|
#include "neigh_list_kokkos.h"
|
||||||
|
#include "math_special.h"
|
||||||
|
#include "update.h"
|
||||||
|
#include "Kokkos_Macros.hpp"
|
||||||
#include "Kokkos_ScatterView.hpp"
|
#include "Kokkos_ScatterView.hpp"
|
||||||
|
|
||||||
namespace LAMMPS_NS {
|
namespace LAMMPS_NS {
|
||||||
@ -63,6 +65,7 @@ struct PairComputeFunctor {
|
|||||||
typename AT::t_f_array f;
|
typename AT::t_f_array f;
|
||||||
typename AT::t_efloat_1d d_eatom;
|
typename AT::t_efloat_1d d_eatom;
|
||||||
typename AT::t_virial_array d_vatom;
|
typename AT::t_virial_array d_vatom;
|
||||||
|
int inum;
|
||||||
|
|
||||||
using KKDeviceType = typename KKDevice<device_type>::value;
|
using KKDeviceType = typename KKDevice<device_type>::value;
|
||||||
using DUP = NeedDup_v<NEIGHFLAG,device_type>;
|
using DUP = NeedDup_v<NEIGHFLAG,device_type>;
|
||||||
@ -81,8 +84,6 @@ struct PairComputeFunctor {
|
|||||||
// typename KKDevice<device_type>::value,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > vatom;
|
// typename KKDevice<device_type>::value,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > vatom;
|
||||||
KKScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,KKDeviceType,KKScatterSum,DUP> dup_vatom;
|
KKScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,KKDeviceType,KKScatterSum,DUP> dup_vatom;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
NeighListKokkos<device_type> list;
|
NeighListKokkos<device_type> list;
|
||||||
|
|
||||||
PairComputeFunctor(PairStyle* c_ptr,
|
PairComputeFunctor(PairStyle* c_ptr,
|
||||||
@ -95,6 +96,7 @@ struct PairComputeFunctor {
|
|||||||
dup_f = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.f);
|
dup_f = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.f);
|
||||||
dup_eatom = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.d_eatom);
|
dup_eatom = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.d_eatom);
|
||||||
dup_vatom = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.d_vatom);
|
dup_vatom = Kokkos::Experimental::create_scatter_view<KKScatterSum, DUP>(c.d_vatom);
|
||||||
|
inum = list.inum;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Set copymode = 1 so parent allocations aren't destructed by copies of the style
|
// Set copymode = 1 so parent allocations aren't destructed by copies of the style
|
||||||
@ -105,17 +107,22 @@ struct PairComputeFunctor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void contribute() {
|
void contribute() {
|
||||||
Kokkos::Experimental::contribute(c.f, dup_f);
|
int need_dup = std::is_same_v<DUP,Kokkos::Experimental::ScatterDuplicated>;
|
||||||
|
|
||||||
if (c.eflag_atom)
|
if (need_dup) {
|
||||||
Kokkos::Experimental::contribute(c.d_eatom, dup_eatom);
|
Kokkos::Experimental::contribute(c.f, dup_f);
|
||||||
|
|
||||||
if (c.vflag_atom)
|
if (c.eflag_atom)
|
||||||
Kokkos::Experimental::contribute(c.d_vatom, dup_vatom);
|
Kokkos::Experimental::contribute(c.d_eatom, dup_eatom);
|
||||||
|
|
||||||
|
if (c.vflag_atom)
|
||||||
|
Kokkos::Experimental::contribute(c.d_vatom, dup_vatom);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loop over neighbors of one atom without coulomb interaction
|
// Loop over neighbors of one atom without coulomb interaction
|
||||||
// This function is called in parallel
|
// This function is called in parallel
|
||||||
|
|
||||||
template<int EVFLAG, int NEWTON_PAIR>
|
template<int EVFLAG, int NEWTON_PAIR>
|
||||||
KOKKOS_FUNCTION
|
KOKKOS_FUNCTION
|
||||||
EV_FLOAT compute_item(const int& ii,
|
EV_FLOAT compute_item(const int& ii,
|
||||||
@ -161,7 +168,7 @@ struct PairComputeFunctor {
|
|||||||
fytmp += dely*fpair;
|
fytmp += dely*fpair;
|
||||||
fztmp += delz*fpair;
|
fztmp += delz*fpair;
|
||||||
|
|
||||||
if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
|
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
|
||||||
a_f(j,0) -= delx*fpair;
|
a_f(j,0) -= delx*fpair;
|
||||||
a_f(j,1) -= dely*fpair;
|
a_f(j,1) -= dely*fpair;
|
||||||
a_f(j,2) -= delz*fpair;
|
a_f(j,2) -= delz*fpair;
|
||||||
@ -169,9 +176,9 @@ struct PairComputeFunctor {
|
|||||||
|
|
||||||
if (EVFLAG) {
|
if (EVFLAG) {
|
||||||
F_FLOAT evdwl = 0.0;
|
F_FLOAT evdwl = 0.0;
|
||||||
if (c.eflag) {
|
if (c.eflag_either) {
|
||||||
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
||||||
ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*evdwl;
|
ev.evdwl += (((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*evdwl;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c.vflag_either || c.eflag_atom) ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
|
if (c.vflag_either || c.eflag_atom) ev_tally(ev,i,j,evdwl,fpair,delx,dely,delz);
|
||||||
@ -189,6 +196,7 @@ struct PairComputeFunctor {
|
|||||||
|
|
||||||
// Loop over neighbors of one atom with coulomb interaction
|
// Loop over neighbors of one atom with coulomb interaction
|
||||||
// This function is called in parallel
|
// This function is called in parallel
|
||||||
|
|
||||||
template<int EVFLAG, int NEWTON_PAIR>
|
template<int EVFLAG, int NEWTON_PAIR>
|
||||||
KOKKOS_FUNCTION
|
KOKKOS_FUNCTION
|
||||||
EV_FLOAT compute_item(const int& ii,
|
EV_FLOAT compute_item(const int& ii,
|
||||||
@ -241,7 +249,7 @@ struct PairComputeFunctor {
|
|||||||
fytmp += dely*fpair;
|
fytmp += dely*fpair;
|
||||||
fztmp += delz*fpair;
|
fztmp += delz*fpair;
|
||||||
|
|
||||||
if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
|
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) {
|
||||||
a_f(j,0) -= delx*fpair;
|
a_f(j,0) -= delx*fpair;
|
||||||
a_f(j,1) -= dely*fpair;
|
a_f(j,1) -= dely*fpair;
|
||||||
a_f(j,2) -= delz*fpair;
|
a_f(j,2) -= delz*fpair;
|
||||||
@ -250,14 +258,14 @@ struct PairComputeFunctor {
|
|||||||
if (EVFLAG) {
|
if (EVFLAG) {
|
||||||
F_FLOAT evdwl = 0.0;
|
F_FLOAT evdwl = 0.0;
|
||||||
F_FLOAT ecoul = 0.0;
|
F_FLOAT ecoul = 0.0;
|
||||||
if (c.eflag) {
|
if (c.eflag_either) {
|
||||||
if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
|
if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
|
||||||
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
||||||
ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*evdwl;
|
ev.evdwl += (((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || (j < c.nlocal)))?1.0:0.5)*evdwl;
|
||||||
}
|
}
|
||||||
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
|
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
|
||||||
ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
|
ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
|
||||||
ev.ecoul += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)*ecoul;
|
ev.ecoul += (((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && (NEWTON_PAIR || (j < c.nlocal)))?1.0:0.5)*ecoul;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -273,14 +281,16 @@ struct PairComputeFunctor {
|
|||||||
return ev;
|
return ev;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
|
// TeamPolicy, newton off, and no energy/virial
|
||||||
// Loop over neighbors of one atom without coulomb interaction
|
// Loop over neighbors of one atom without coulomb interaction
|
||||||
// This function is called in parallel
|
// This function is called in parallel
|
||||||
|
|
||||||
KOKKOS_FUNCTION
|
KOKKOS_FUNCTION
|
||||||
void compute_item_team(typename Kokkos::TeamPolicy<device_type>::member_type team,
|
void compute_item_team(typename Kokkos::TeamPolicy<device_type>::member_type team,
|
||||||
const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
|
const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
|
||||||
|
|
||||||
const int inum = team.league_size();
|
auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
|
|
||||||
const int atoms_per_team = team.team_size();
|
const int atoms_per_team = team.team_size();
|
||||||
const int firstatom = team.league_rank()*atoms_per_team;
|
const int firstatom = team.league_rank()*atoms_per_team;
|
||||||
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
|
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
|
||||||
@ -292,7 +302,7 @@ struct PairComputeFunctor {
|
|||||||
const X_FLOAT ztmp = c.x(i,2);
|
const X_FLOAT ztmp = c.x(i,2);
|
||||||
const int itype = c.type(i);
|
const int itype = c.type(i);
|
||||||
|
|
||||||
if (ZEROFLAG) {
|
if (NEIGHFLAG == FULL && ZEROFLAG) {
|
||||||
Kokkos::single(Kokkos::PerThread(team), [&] (){
|
Kokkos::single(Kokkos::PerThread(team), [&] (){
|
||||||
f(i,0) = 0.0;
|
f(i,0) = 0.0;
|
||||||
f(i,1) = 0.0;
|
f(i,1) = 0.0;
|
||||||
@ -321,30 +331,42 @@ struct PairComputeFunctor {
|
|||||||
|
|
||||||
const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
||||||
|
|
||||||
ftmp.x += delx*fpair;
|
const F_FLOAT fx = delx*fpair;
|
||||||
ftmp.y += dely*fpair;
|
const F_FLOAT fy = dely*fpair;
|
||||||
ftmp.z += delz*fpair;
|
const F_FLOAT fz = delz*fpair;
|
||||||
|
|
||||||
|
ftmp.x += fx;
|
||||||
|
ftmp.y += fy;
|
||||||
|
ftmp.z += fz;
|
||||||
|
|
||||||
|
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
|
||||||
|
a_f(j,0) -= fx;
|
||||||
|
a_f(j,1) -= fy;
|
||||||
|
a_f(j,2) -= fz;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
},fsum);
|
},fsum);
|
||||||
|
|
||||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||||
f(i,0) += fsum.x;
|
a_f(i,0) += fsum.x;
|
||||||
f(i,1) += fsum.y;
|
a_f(i,1) += fsum.y;
|
||||||
f(i,2) += fsum.z;
|
a_f(i,2) += fsum.z;
|
||||||
});
|
});
|
||||||
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
|
// TeamPolicy, newton off, and no energy/virial
|
||||||
// Loop over neighbors of one atom with coulomb interaction
|
// Loop over neighbors of one atom with coulomb interaction
|
||||||
// This function is called in parallel
|
// This function is called in parallel
|
||||||
|
|
||||||
KOKKOS_FUNCTION
|
KOKKOS_FUNCTION
|
||||||
void compute_item_team(typename Kokkos::TeamPolicy<device_type>::member_type team,
|
void compute_item_team(typename Kokkos::TeamPolicy<device_type>::member_type team,
|
||||||
const NeighListKokkos<device_type> &list, const CoulTag& ) const {
|
const NeighListKokkos<device_type> &list, const CoulTag& ) const {
|
||||||
|
|
||||||
const int inum = team.league_size();
|
auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
|
|
||||||
const int atoms_per_team = team.team_size();
|
const int atoms_per_team = team.team_size();
|
||||||
int firstatom = team.league_rank()*atoms_per_team;
|
int firstatom = team.league_rank()*atoms_per_team;
|
||||||
int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
|
int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
|
||||||
@ -357,8 +379,9 @@ struct PairComputeFunctor {
|
|||||||
const int itype = c.type(i);
|
const int itype = c.type(i);
|
||||||
const F_FLOAT qtmp = c.q(i);
|
const F_FLOAT qtmp = c.q(i);
|
||||||
|
|
||||||
if (ZEROFLAG) {
|
if (NEIGHFLAG == FULL && ZEROFLAG) {
|
||||||
Kokkos::single(Kokkos::PerThread(team), [&] (){
|
Kokkos::single(Kokkos::PerThread(team), [&] ()
|
||||||
|
{
|
||||||
f(i,0) = 0.0;
|
f(i,0) = 0.0;
|
||||||
f(i,1) = 0.0;
|
f(i,1) = 0.0;
|
||||||
f(i,2) = 0.0;
|
f(i,2) = 0.0;
|
||||||
@ -391,31 +414,45 @@ struct PairComputeFunctor {
|
|||||||
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
|
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
|
||||||
fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
|
fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
|
||||||
|
|
||||||
ftmp.x += delx*fpair;
|
const F_FLOAT fx = delx*fpair;
|
||||||
ftmp.y += dely*fpair;
|
const F_FLOAT fy = dely*fpair;
|
||||||
ftmp.z += delz*fpair;
|
const F_FLOAT fz = delz*fpair;
|
||||||
|
|
||||||
|
ftmp.x += fx;
|
||||||
|
ftmp.y += fy;
|
||||||
|
ftmp.z += fz;
|
||||||
|
|
||||||
|
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
|
||||||
|
a_f(j,0) -= fx;
|
||||||
|
a_f(j,1) -= fy;
|
||||||
|
a_f(j,2) -= fz;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
},fsum);
|
},fsum);
|
||||||
|
|
||||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||||
f(i,0) += fsum.x;
|
a_f(i,0) += fsum.x;
|
||||||
f(i,1) += fsum.y;
|
a_f(i,1) += fsum.y;
|
||||||
f(i,2) += fsum.z;
|
a_f(i,2) += fsum.z;
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TeamPolicy, newton off, and energy/virial
|
||||||
// Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
|
|
||||||
// Loop over neighbors of one atom without coulomb interaction
|
// Loop over neighbors of one atom without coulomb interaction
|
||||||
// This function is called in parallel
|
// This function is called in parallel
|
||||||
|
|
||||||
KOKKOS_FUNCTION
|
KOKKOS_FUNCTION
|
||||||
EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy<device_type>::member_type team,
|
EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy<device_type>::member_type team,
|
||||||
const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
|
const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
|
||||||
|
|
||||||
|
auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
|
auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
|
auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
|
|
||||||
EV_FLOAT ev;
|
EV_FLOAT ev;
|
||||||
|
|
||||||
const int inum = team.league_size();
|
|
||||||
const int atoms_per_team = team.team_size();
|
const int atoms_per_team = team.team_size();
|
||||||
const int firstatom = team.league_rank()*atoms_per_team;
|
const int firstatom = team.league_rank()*atoms_per_team;
|
||||||
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
|
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
|
||||||
@ -427,8 +464,9 @@ struct PairComputeFunctor {
|
|||||||
const X_FLOAT ztmp = c.x(i,2);
|
const X_FLOAT ztmp = c.x(i,2);
|
||||||
const int itype = c.type(i);
|
const int itype = c.type(i);
|
||||||
|
|
||||||
if (ZEROFLAG) {
|
if (NEIGHFLAG == FULL && ZEROFLAG) {
|
||||||
Kokkos::single(Kokkos::PerThread(team), [&] (){
|
Kokkos::single(Kokkos::PerThread(team), [&] ()
|
||||||
|
{
|
||||||
f(i,0) = 0.0;
|
f(i,0) = 0.0;
|
||||||
f(i,1) = 0.0;
|
f(i,1) = 0.0;
|
||||||
f(i,2) = 0.0;
|
f(i,2) = 0.0;
|
||||||
@ -456,37 +494,85 @@ struct PairComputeFunctor {
|
|||||||
|
|
||||||
const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
||||||
|
|
||||||
fev_tmp.f[0] += delx*fpair;
|
const F_FLOAT fx = delx*fpair;
|
||||||
fev_tmp.f[1] += dely*fpair;
|
const F_FLOAT fy = dely*fpair;
|
||||||
fev_tmp.f[2] += delz*fpair;
|
const F_FLOAT fz = delz*fpair;
|
||||||
|
|
||||||
|
fev_tmp.f[0] += fx;
|
||||||
|
fev_tmp.f[1] += fy;
|
||||||
|
fev_tmp.f[2] += fz;
|
||||||
|
|
||||||
|
const int I_CONTRIB = (NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD);
|
||||||
|
const int J_CONTRIB = ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal);
|
||||||
|
const E_FLOAT factor = J_CONTRIB?1.0:0.5;
|
||||||
|
|
||||||
|
if (J_CONTRIB) {
|
||||||
|
a_f(j,0) -= fx;
|
||||||
|
a_f(j,1) -= fy;
|
||||||
|
a_f(j,2) -= fz;
|
||||||
|
}
|
||||||
|
|
||||||
F_FLOAT evdwl = 0.0;
|
F_FLOAT evdwl = 0.0;
|
||||||
if (c.eflag) {
|
if (c.eflag_either) {
|
||||||
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
||||||
fev_tmp.evdwl += 0.5*evdwl;
|
fev_tmp.evdwl += factor * evdwl;
|
||||||
|
|
||||||
|
if (c.eflag_atom) {
|
||||||
|
const E_FLOAT epairhalf = 0.5 * evdwl;
|
||||||
|
|
||||||
|
if (I_CONTRIB)
|
||||||
|
a_eatom[i] += epairhalf;
|
||||||
|
|
||||||
|
if (J_CONTRIB)
|
||||||
|
a_eatom[j] += epairhalf;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c.vflag_either) {
|
if (c.vflag_either) {
|
||||||
fev_tmp.v[0] += 0.5*delx*delx*fpair;
|
const E_FLOAT v0 = delx*delx*fpair;
|
||||||
fev_tmp.v[1] += 0.5*dely*dely*fpair;
|
const E_FLOAT v1 = dely*dely*fpair;
|
||||||
fev_tmp.v[2] += 0.5*delz*delz*fpair;
|
const E_FLOAT v2 = delz*delz*fpair;
|
||||||
fev_tmp.v[3] += 0.5*delx*dely*fpair;
|
const E_FLOAT v3 = delx*dely*fpair;
|
||||||
fev_tmp.v[4] += 0.5*delx*delz*fpair;
|
const E_FLOAT v4 = delx*delz*fpair;
|
||||||
fev_tmp.v[5] += 0.5*dely*delz*fpair;
|
const E_FLOAT v5 = dely*delz*fpair;
|
||||||
|
|
||||||
|
fev_tmp.v[0] += factor*v0;
|
||||||
|
fev_tmp.v[1] += factor*v1;
|
||||||
|
fev_tmp.v[2] += factor*v2;
|
||||||
|
fev_tmp.v[3] += factor*v3;
|
||||||
|
fev_tmp.v[4] += factor*v4;
|
||||||
|
fev_tmp.v[5] += factor*v5;
|
||||||
|
|
||||||
|
if (c.vflag_atom) {
|
||||||
|
if (I_CONTRIB) {
|
||||||
|
a_vatom(i,0) += 0.5*v0;
|
||||||
|
a_vatom(i,1) += 0.5*v1;
|
||||||
|
a_vatom(i,2) += 0.5*v2;
|
||||||
|
a_vatom(i,3) += 0.5*v3;
|
||||||
|
a_vatom(i,4) += 0.5*v4;
|
||||||
|
a_vatom(i,5) += 0.5*v5;
|
||||||
|
}
|
||||||
|
if (J_CONTRIB) {
|
||||||
|
a_vatom(j,0) += 0.5*v0;
|
||||||
|
a_vatom(j,1) += 0.5*v1;
|
||||||
|
a_vatom(j,2) += 0.5*v2;
|
||||||
|
a_vatom(j,3) += 0.5*v3;
|
||||||
|
a_vatom(j,4) += 0.5*v4;
|
||||||
|
a_vatom(j,5) += 0.5*v5;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},fev);
|
},fev);
|
||||||
|
|
||||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||||
f(i,0) += fev.f[0];
|
a_f(i,0) += fev.f[0];
|
||||||
f(i,1) += fev.f[1];
|
a_f(i,1) += fev.f[1];
|
||||||
f(i,2) += fev.f[2];
|
a_f(i,2) += fev.f[2];
|
||||||
|
|
||||||
if (c.eflag_global)
|
if (c.eflag_global)
|
||||||
ev.evdwl += fev.evdwl;
|
ev.evdwl += fev.evdwl;
|
||||||
|
|
||||||
if (c.eflag_atom)
|
|
||||||
d_eatom(i) += fev.evdwl;
|
|
||||||
|
|
||||||
if (c.vflag_global) {
|
if (c.vflag_global) {
|
||||||
ev.v[0] += fev.v[0];
|
ev.v[0] += fev.v[0];
|
||||||
ev.v[1] += fev.v[1];
|
ev.v[1] += fev.v[1];
|
||||||
@ -496,29 +582,39 @@ struct PairComputeFunctor {
|
|||||||
ev.v[5] += fev.v[5];
|
ev.v[5] += fev.v[5];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c.vflag_atom) {
|
if (NEIGHFLAG == FULL) {
|
||||||
d_vatom(i,0) += fev.v[0];
|
|
||||||
d_vatom(i,1) += fev.v[1];
|
if (c.eflag_atom)
|
||||||
d_vatom(i,2) += fev.v[2];
|
a_eatom(i) += fev.evdwl;
|
||||||
d_vatom(i,3) += fev.v[3];
|
|
||||||
d_vatom(i,4) += fev.v[4];
|
if (c.vflag_atom) {
|
||||||
d_vatom(i,5) += fev.v[5];
|
a_vatom(i,0) += fev.v[0];
|
||||||
|
a_vatom(i,1) += fev.v[1];
|
||||||
|
a_vatom(i,2) += fev.v[2];
|
||||||
|
a_vatom(i,3) += fev.v[3];
|
||||||
|
a_vatom(i,4) += fev.v[4];
|
||||||
|
a_vatom(i,5) += fev.v[5];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
return ev;
|
return ev;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
|
// TeamPolicy, newton off, and energy/virial
|
||||||
// Loop over neighbors of one atom with coulomb interaction
|
// Loop over neighbors of one atom with coulomb interaction
|
||||||
// This function is called in parallel
|
// This function is called in parallel
|
||||||
|
|
||||||
KOKKOS_FUNCTION
|
KOKKOS_FUNCTION
|
||||||
EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy<device_type>::member_type team,
|
EV_FLOAT compute_item_team_ev(typename Kokkos::TeamPolicy<device_type>::member_type team,
|
||||||
const NeighListKokkos<device_type> &list, const CoulTag& ) const {
|
const NeighListKokkos<device_type> &list, const CoulTag& ) const {
|
||||||
|
|
||||||
|
auto a_f = dup_f.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
|
auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
|
auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
|
|
||||||
EV_FLOAT ev;
|
EV_FLOAT ev;
|
||||||
|
|
||||||
const int inum = team.league_size();
|
|
||||||
const int atoms_per_team = team.team_size();
|
const int atoms_per_team = team.team_size();
|
||||||
const int firstatom = team.league_rank()*atoms_per_team;
|
const int firstatom = team.league_rank()*atoms_per_team;
|
||||||
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
|
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
|
||||||
@ -566,45 +662,92 @@ struct PairComputeFunctor {
|
|||||||
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
|
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
|
||||||
fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
|
fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
|
||||||
|
|
||||||
fev_tmp.f[0] += delx*fpair;
|
const F_FLOAT fx = delx*fpair;
|
||||||
fev_tmp.f[1] += dely*fpair;
|
const F_FLOAT fy = dely*fpair;
|
||||||
fev_tmp.f[2] += delz*fpair;
|
const F_FLOAT fz = delz*fpair;
|
||||||
|
|
||||||
|
fev_tmp.f[0] += fx;
|
||||||
|
fev_tmp.f[1] += fy;
|
||||||
|
fev_tmp.f[2] += fz;
|
||||||
|
|
||||||
|
const int I_CONTRIB = (NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD);
|
||||||
|
const int J_CONTRIB = ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal);
|
||||||
|
const E_FLOAT factor = J_CONTRIB?1.0:0.5;
|
||||||
|
|
||||||
|
if ((NEIGHFLAG == HALF || NEIGHFLAG == HALFTHREAD) && j < c.nlocal) {
|
||||||
|
a_f(j,0) -= fx;
|
||||||
|
a_f(j,1) -= fy;
|
||||||
|
a_f(j,2) -= fz;
|
||||||
|
}
|
||||||
|
|
||||||
F_FLOAT evdwl = 0.0;
|
F_FLOAT evdwl = 0.0;
|
||||||
F_FLOAT ecoul = 0.0;
|
F_FLOAT ecoul = 0.0;
|
||||||
if (c.eflag) {
|
if (c.eflag_either) {
|
||||||
if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
|
if (rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
|
||||||
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
|
||||||
fev_tmp.evdwl += 0.5*evdwl;
|
fev_tmp.evdwl += factor * evdwl;
|
||||||
}
|
}
|
||||||
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
|
if (rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
|
||||||
ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
|
ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
|
||||||
fev_tmp.ecoul += 0.5*ecoul;
|
fev_tmp.ecoul += factor * ecoul;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (c.eflag_atom) {
|
||||||
|
const E_FLOAT epairhalf = 0.5 * (evdwl + ecoul);
|
||||||
|
|
||||||
|
if (I_CONTRIB)
|
||||||
|
a_eatom[i] += epairhalf;
|
||||||
|
|
||||||
|
if (J_CONTRIB)
|
||||||
|
a_eatom[j] += epairhalf;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c.vflag_either) {
|
if (c.vflag_either) {
|
||||||
fev_tmp.v[0] += 0.5*delx*delx*fpair;
|
const E_FLOAT v0 = delx*delx*fpair;
|
||||||
fev_tmp.v[1] += 0.5*dely*dely*fpair;
|
const E_FLOAT v1 = dely*dely*fpair;
|
||||||
fev_tmp.v[2] += 0.5*delz*delz*fpair;
|
const E_FLOAT v2 = delz*delz*fpair;
|
||||||
fev_tmp.v[3] += 0.5*delx*dely*fpair;
|
const E_FLOAT v3 = delx*dely*fpair;
|
||||||
fev_tmp.v[4] += 0.5*delx*delz*fpair;
|
const E_FLOAT v4 = delx*delz*fpair;
|
||||||
fev_tmp.v[5] += 0.5*dely*delz*fpair;
|
const E_FLOAT v5 = dely*delz*fpair;
|
||||||
|
|
||||||
|
fev_tmp.v[0] += factor*v0;
|
||||||
|
fev_tmp.v[1] += factor*v1;
|
||||||
|
fev_tmp.v[2] += factor*v2;
|
||||||
|
fev_tmp.v[3] += factor*v3;
|
||||||
|
fev_tmp.v[4] += factor*v4;
|
||||||
|
fev_tmp.v[5] += factor*v5;
|
||||||
|
|
||||||
|
if (c.vflag_atom) {
|
||||||
|
if (I_CONTRIB) {
|
||||||
|
a_vatom(i,0) += 0.5*v0;
|
||||||
|
a_vatom(i,1) += 0.5*v1;
|
||||||
|
a_vatom(i,2) += 0.5*v2;
|
||||||
|
a_vatom(i,3) += 0.5*v3;
|
||||||
|
a_vatom(i,4) += 0.5*v4;
|
||||||
|
a_vatom(i,5) += 0.5*v5;
|
||||||
|
}
|
||||||
|
if (J_CONTRIB) {
|
||||||
|
a_vatom(j,0) += 0.5*v0;
|
||||||
|
a_vatom(j,1) += 0.5*v1;
|
||||||
|
a_vatom(j,2) += 0.5*v2;
|
||||||
|
a_vatom(j,3) += 0.5*v3;
|
||||||
|
a_vatom(j,4) += 0.5*v4;
|
||||||
|
a_vatom(j,5) += 0.5*v5;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},fev);
|
},fev);
|
||||||
|
|
||||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||||
f(i,0) += fev.f[0];
|
a_f(i,0) += fev.f[0];
|
||||||
f(i,1) += fev.f[1];
|
a_f(i,1) += fev.f[1];
|
||||||
f(i,2) += fev.f[2];
|
a_f(i,2) += fev.f[2];
|
||||||
|
|
||||||
if (c.eflag_global) {
|
if (c.eflag_global)
|
||||||
ev.evdwl += fev.evdwl;
|
ev.evdwl += fev.evdwl;
|
||||||
ev.ecoul += fev.ecoul;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (c.eflag_atom)
|
|
||||||
d_eatom(i) += fev.evdwl + fev.ecoul;
|
|
||||||
|
|
||||||
if (c.vflag_global) {
|
if (c.vflag_global) {
|
||||||
ev.v[0] += fev.v[0];
|
ev.v[0] += fev.v[0];
|
||||||
@ -615,13 +758,19 @@ struct PairComputeFunctor {
|
|||||||
ev.v[5] += fev.v[5];
|
ev.v[5] += fev.v[5];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c.vflag_atom) {
|
if (NEIGHFLAG == FULL) {
|
||||||
d_vatom(i,0) += fev.v[0];
|
|
||||||
d_vatom(i,1) += fev.v[1];
|
if (c.eflag_atom)
|
||||||
d_vatom(i,2) += fev.v[2];
|
a_eatom(i) += fev.evdwl;
|
||||||
d_vatom(i,3) += fev.v[3];
|
|
||||||
d_vatom(i,4) += fev.v[4];
|
if (c.vflag_atom) {
|
||||||
d_vatom(i,5) += fev.v[5];
|
a_vatom(i,0) += fev.v[0];
|
||||||
|
a_vatom(i,1) += fev.v[1];
|
||||||
|
a_vatom(i,2) += fev.v[2];
|
||||||
|
a_vatom(i,3) += fev.v[3];
|
||||||
|
a_vatom(i,4) += fev.v[4];
|
||||||
|
a_vatom(i,5) += fev.v[5];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
@ -636,7 +785,7 @@ struct PairComputeFunctor {
|
|||||||
auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
auto a_eatom = dup_eatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
auto a_vatom = dup_vatom.template access<typename AtomicDup<NEIGHFLAG,device_type>::value>();
|
||||||
|
|
||||||
const int EFLAG = c.eflag;
|
const int EFLAG = c.eflag_either;
|
||||||
const int NEWTON_PAIR = c.newton_pair;
|
const int NEWTON_PAIR = c.newton_pair;
|
||||||
const int VFLAG = c.vflag_either;
|
const int VFLAG = c.vflag_either;
|
||||||
|
|
||||||
@ -657,7 +806,7 @@ struct PairComputeFunctor {
|
|||||||
const E_FLOAT v5 = dely*delz*fpair;
|
const E_FLOAT v5 = dely*delz*fpair;
|
||||||
|
|
||||||
if (c.vflag_global) {
|
if (c.vflag_global) {
|
||||||
if (NEIGHFLAG!=FULL) {
|
if (NEIGHFLAG != FULL) {
|
||||||
if (NEWTON_PAIR) {
|
if (NEWTON_PAIR) {
|
||||||
ev.v[0] += v0;
|
ev.v[0] += v0;
|
||||||
ev.v[1] += v1;
|
ev.v[1] += v1;
|
||||||
@ -747,7 +896,8 @@ struct PairComputeFunctor {
|
|||||||
// This uses the fact that failure to match template parameters is not an error.
|
// This uses the fact that failure to match template parameters is not an error.
|
||||||
// By having the enable_if with a ! and without it, exactly one of the functions
|
// By having the enable_if with a ! and without it, exactly one of the functions
|
||||||
// pair_compute_neighlist will match - either the dummy version
|
// pair_compute_neighlist will match - either the dummy version
|
||||||
// or the real one further below.
|
// or the real one further below
|
||||||
|
|
||||||
template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisation = void>
|
template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisation = void>
|
||||||
EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<!((NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0), NeighListKokkos<typename PairStyle::device_type>*> list) {
|
EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<!((NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0), NeighListKokkos<typename PairStyle::device_type>*> list) {
|
||||||
EV_FLOAT ev;
|
EV_FLOAT ev;
|
||||||
@ -757,24 +907,29 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<!((NEIGHFLAG
|
|||||||
return ev;
|
return ev;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class NeighStyle>
|
||||||
|
int GetMaxNeighs(NeighStyle* list)
|
||||||
|
{
|
||||||
|
auto d_ilist = list->d_ilist;
|
||||||
|
auto d_numneigh = list->d_numneigh;
|
||||||
|
int inum = list->inum;
|
||||||
|
|
||||||
|
int maxneigh = 0;
|
||||||
|
Kokkos::parallel_reduce(inum, LAMMPS_LAMBDA(const int ii, int &maxneigh) {
|
||||||
|
const int i = d_ilist[ii];
|
||||||
|
const int num_neighs = d_numneigh[i];
|
||||||
|
maxneigh = MAX(maxneigh,num_neighs);
|
||||||
|
}, Kokkos::Max<int>(maxneigh));
|
||||||
|
|
||||||
|
return maxneigh;
|
||||||
|
}
|
||||||
|
|
||||||
template<class DeviceType, class FunctorStyle>
|
template<class DeviceType, class FunctorStyle>
|
||||||
int GetTeamSize(FunctorStyle& KOKKOS_GPU_ARG(functor), int KOKKOS_GPU_ARG(inum),
|
void GetMaxTeamSize(FunctorStyle& functor, int inum,
|
||||||
int KOKKOS_GPU_ARG(reduce_flag), int team_size, int KOKKOS_GPU_ARG(vector_length)) {
|
int &teamsize_max_for, int &teamsize_max_reduce)
|
||||||
|
{
|
||||||
#ifdef LMP_KOKKOS_GPU
|
teamsize_max_for = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelForTag());
|
||||||
int team_size_max;
|
teamsize_max_reduce = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelReduceTag());
|
||||||
|
|
||||||
if (reduce_flag)
|
|
||||||
team_size_max = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelReduceTag());
|
|
||||||
else
|
|
||||||
team_size_max = Kokkos::TeamPolicy<DeviceType>(inum,Kokkos::AUTO).team_size_max(functor,Kokkos::ParallelForTag());
|
|
||||||
|
|
||||||
if (team_size*vector_length > team_size_max)
|
|
||||||
team_size = team_size_max/vector_length;
|
|
||||||
#else
|
|
||||||
team_size = 1;
|
|
||||||
#endif
|
|
||||||
return team_size;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL
|
// Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL
|
||||||
@ -782,38 +937,77 @@ template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisa
|
|||||||
EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*> list) {
|
EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*> list) {
|
||||||
EV_FLOAT ev;
|
EV_FLOAT ev;
|
||||||
|
|
||||||
|
const int inum = list->inum;
|
||||||
|
|
||||||
if (!fpair->lmp->kokkos->neigh_thread_set)
|
if (!fpair->lmp->kokkos->neigh_thread_set)
|
||||||
if (list->inum <= 16384 && NEIGHFLAG == FULL)
|
if (fpair->lmp->kokkos->ngpus && inum <= 16000)
|
||||||
fpair->lmp->kokkos->neigh_thread = 1;
|
if (NEIGHFLAG == FULL || !fpair->newton_pair)
|
||||||
|
fpair->lmp->kokkos->neigh_thread = 1;
|
||||||
|
|
||||||
if (fpair->lmp->kokkos->neigh_thread) {
|
if (fpair->lmp->kokkos->neigh_thread) {
|
||||||
|
|
||||||
int vector_length = 8;
|
static int vectorsize = 0;
|
||||||
int atoms_per_team = 32;
|
static int atoms_per_team = 0;
|
||||||
|
static int lastcall = -1;
|
||||||
|
|
||||||
|
#if defined(LMP_KOKKOS_GPU)
|
||||||
|
if (!vectorsize || lastcall < fpair->lmp->neighbor->lastcall) {
|
||||||
|
lastcall = fpair->lmp->update->ntimestep;
|
||||||
|
vectorsize = GetMaxNeighs(list);
|
||||||
|
vectorsize = MathSpecial::powint(2,(int(log2(vectorsize) + 0.5))); // round to nearest power of 2
|
||||||
|
|
||||||
|
#if defined(KOKKOS_ENABLE_HIP)
|
||||||
|
int max_vectorsize = 64;
|
||||||
|
#else
|
||||||
|
int max_vectorsize = 32;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
vectorsize = MIN(vectorsize,max_vectorsize);
|
||||||
|
|
||||||
|
int teamsize_max_for,teamsize_max_reduce;
|
||||||
|
if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
|
||||||
|
PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
|
||||||
|
GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
|
||||||
|
} else {
|
||||||
|
PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
|
||||||
|
GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
|
||||||
|
}
|
||||||
|
|
||||||
|
int teamsize_max = teamsize_max_for;
|
||||||
|
if (fpair->eflag || fpair->vflag)
|
||||||
|
teamsize_max = teamsize_max_reduce;
|
||||||
|
atoms_per_team = teamsize_max/vectorsize;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
vectorsize = 1;
|
||||||
|
atoms_per_team = 1;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
|
||||||
|
|
||||||
if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
|
if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
|
||||||
PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
|
PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
|
||||||
atoms_per_team = GetTeamSize<typename PairStyle::device_type>(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length);
|
Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(num_teams,atoms_per_team,vectorsize);
|
||||||
Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
|
|
||||||
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
|
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
|
||||||
else Kokkos::parallel_for(policy,ff);
|
else Kokkos::parallel_for(policy,ff);
|
||||||
|
ff.contribute();
|
||||||
} else {
|
} else {
|
||||||
PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
|
PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
|
||||||
atoms_per_team = GetTeamSize<typename PairStyle::device_type>(ff, list->inum, (fpair->eflag || fpair->vflag), atoms_per_team, vector_length);
|
Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(num_teams,atoms_per_team,vectorsize);
|
||||||
Kokkos::TeamPolicy<typename PairStyle::device_type,Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
|
|
||||||
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
|
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
|
||||||
else Kokkos::parallel_for(policy,ff);
|
else Kokkos::parallel_for(policy,ff);
|
||||||
|
ff.contribute();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
|
if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
|
||||||
PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
|
PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
|
||||||
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
|
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev);
|
||||||
else Kokkos::parallel_for(list->inum,ff);
|
else Kokkos::parallel_for(inum,ff);
|
||||||
ff.contribute();
|
ff.contribute();
|
||||||
} else {
|
} else {
|
||||||
PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
|
PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
|
||||||
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
|
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev);
|
||||||
else Kokkos::parallel_for(list->inum,ff);
|
else Kokkos::parallel_for(inum,ff);
|
||||||
ff.contribute();
|
ff.contribute();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user