Add team-based calcs to some KOKKOS package pair_styles

This commit is contained in:
Stan Moore
2019-02-06 14:42:37 -07:00
parent d6eaf73db1
commit 58905525bf
4 changed files with 439 additions and 9 deletions

View File

@ -192,6 +192,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
forward_comm_on_host = 0;
reverse_comm_on_host = 0;
gpu_direct_flag = 1;
team_flag = 0;
#if KOKKOS_USE_CUDA
// only if we can safely detect, that GPU-direct is not available, change default
@ -228,6 +229,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
gpu_direct_flag = 1;
team_flag = 0;
int iarg = 0;
while (iarg < narg) {
@ -317,6 +319,12 @@ void KokkosLMP::accelerator(int narg, char **arg)
else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct_flag = 1;
else error->all(FLERR,"Illegal package kokkos command");
iarg += 2;
} else if (strcmp(arg[iarg],"team") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
if (strcmp(arg[iarg+1],"off") == 0) team_flag = 0;
else if (strcmp(arg[iarg+1],"on") == 0) team_flag = 1;
else error->all(FLERR,"Illegal package kokkos command");
iarg += 2;
} else error->all(FLERR,"Illegal package kokkos command");
}

View File

@ -36,6 +36,7 @@ class KokkosLMP : protected Pointers {
int numa;
int auto_sync;
int gpu_direct_flag;
int team_flag;
KokkosLMP(class LAMMPS *, int, char **);
~KokkosLMP();

View File

@ -448,6 +448,52 @@ struct s_EV_FLOAT_REAX {
};
typedef struct s_EV_FLOAT_REAX EV_FLOAT_REAX;
struct s_FEV_FLOAT {
F_FLOAT f[3];
E_FLOAT evdwl;
E_FLOAT ecoul;
E_FLOAT v[6];
KOKKOS_INLINE_FUNCTION
s_FEV_FLOAT() {
f[0] = 0; f[1] = 0; f[2] = 0;
evdwl = 0;
ecoul = 0;
v[0] = 0; v[1] = 0; v[2] = 0;
v[3] = 0; v[4] = 0; v[5] = 0;
}
KOKKOS_INLINE_FUNCTION
void operator+=(const s_FEV_FLOAT &rhs) {
f[0] += rhs.f[0];
f[1] += rhs.f[1];
f[2] += rhs.f[2];
evdwl += rhs.evdwl;
ecoul += rhs.ecoul;
v[0] += rhs.v[0];
v[1] += rhs.v[1];
v[2] += rhs.v[2];
v[3] += rhs.v[3];
v[4] += rhs.v[4];
v[5] += rhs.v[5];
}
KOKKOS_INLINE_FUNCTION
void operator+=(const volatile s_FEV_FLOAT &rhs) volatile {
f[0] += rhs.f[0];
f[1] += rhs.f[1];
f[2] += rhs.f[2];
evdwl += rhs.evdwl;
ecoul += rhs.ecoul;
v[0] += rhs.v[0];
v[1] += rhs.v[1];
v[2] += rhs.v[2];
v[3] += rhs.v[3];
v[4] += rhs.v[4];
v[5] += rhs.v[5];
}
};
typedef struct s_FEV_FLOAT FEV_FLOAT;
#ifndef PREC_POS
#define PREC_POS PRECISION
#endif

View File

@ -86,6 +86,7 @@ struct PairComputeFunctor {
NeighListKokkos<device_type>* list_ptr):
c(*c_ptr),list(*list_ptr) {
// allocate duplicated memory
f = c.f;
dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, NeedDup<NEIGHFLAG,device_type>::value >(c.f);
dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, NeedDup<NEIGHFLAG,device_type>::value >(c.d_eatom);
dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, NeedDup<NEIGHFLAG,device_type>::value >(c.d_vatom);
@ -255,6 +256,329 @@ struct PairComputeFunctor {
return ev;
}
// Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
// Loop over neighbors of one atom without coulomb interaction
// This function is called in parallel
KOKKOS_FUNCTION
void compute_item_team(Kokkos::TeamPolicy<>::member_type team,
const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
const int inum = team.league_size();
const int atoms_per_team = team.team_size();
const int firstatom = team.league_rank()*atoms_per_team;
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) {
const int i = list.d_ilist[ii];
const X_FLOAT xtmp = c.x(i,0);
const X_FLOAT ytmp = c.x(i,1);
const X_FLOAT ztmp = c.x(i,2);
const int itype = c.type(i);
const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
const int jnum = list.d_numneigh[i];
t_scalar3<double> fsum;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum),
[&] (const int jj, t_scalar3<double>& ftmp) {
int j = neighbors_i(jj);
const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
j &= NEIGHMASK;
const X_FLOAT delx = xtmp - c.x(j,0);
const X_FLOAT dely = ytmp - c.x(j,1);
const X_FLOAT delz = ztmp - c.x(j,2);
const int jtype = c.type(j);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
ftmp.x += delx*fpair;
ftmp.y += dely*fpair;
ftmp.z += delz*fpair;
}
},fsum);
Kokkos::single(Kokkos::PerThread(team), [&] (){
f(i,0) += fsum.x;
f(i,1) += fsum.y;
f(i,2) += fsum.z;
});
});
}
// Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial
// Loop over neighbors of one atom with coulomb interaction
// This function is called in parallel
KOKKOS_FUNCTION
void compute_item_team(Kokkos::TeamPolicy<>::member_type team,
const NeighListKokkos<device_type> &list, const CoulTag& ) const {
const int inum = team.league_size();
const int atoms_per_team = team.team_size();
int firstatom = team.league_rank()*atoms_per_team;
int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) {
const int i = list.d_ilist[ii];
const X_FLOAT xtmp = c.x(i,0);
const X_FLOAT ytmp = c.x(i,1);
const X_FLOAT ztmp = c.x(i,2);
const int itype = c.type(i);
const F_FLOAT qtmp = c.q(i);
const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
const int jnum = list.d_numneigh[i];
t_scalar3<double> fsum;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum),
[&] (const int jj, t_scalar3<double>& ftmp) {
int j = neighbors_i(jj);
const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
const F_FLOAT factor_coul = c.special_coul[sbmask(j)];
j &= NEIGHMASK;
const X_FLOAT delx = xtmp - c.x(j,0);
const X_FLOAT dely = ytmp - c.x(j,1);
const X_FLOAT delz = ztmp - c.x(j,2);
const int jtype = c.type(j);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
F_FLOAT fpair = F_FLOAT();
if(rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype)))
fpair+=factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
if(rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
ftmp.x += delx*fpair;
ftmp.y += dely*fpair;
ftmp.z += delz*fpair;
}
},fsum);
Kokkos::single(Kokkos::PerThread(team), [&] (){
f(i,0) += fsum.x;
f(i,1) += fsum.y;
f(i,2) += fsum.z;
});
});
}
// Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
// Loop over neighbors of one atom without coulomb interaction
// This function is called in parallel
KOKKOS_FUNCTION
EV_FLOAT compute_item_team_ev(Kokkos::TeamPolicy<>::member_type team,
const NeighListKokkos<device_type> &list, const NoCoulTag&) const {
EV_FLOAT ev;
const int inum = team.league_size();
const int atoms_per_team = team.team_size();
const int firstatom = team.league_rank()*atoms_per_team;
const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) {
const int i = list.d_ilist[ii];
const X_FLOAT xtmp = c.x(i,0);
const X_FLOAT ytmp = c.x(i,1);
const X_FLOAT ztmp = c.x(i,2);
const int itype = c.type(i);
const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
const int jnum = list.d_numneigh[i];
FEV_FLOAT fev;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum),
[&] (const int jj, FEV_FLOAT& fev_tmp) {
int j = neighbors_i(jj);
const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
j &= NEIGHMASK;
const X_FLOAT delx = xtmp - c.x(j,0);
const X_FLOAT dely = ytmp - c.x(j,1);
const X_FLOAT delz = ztmp - c.x(j,2);
const int jtype = c.type(j);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
fev_tmp.f[0] += delx*fpair;
fev_tmp.f[1] += dely*fpair;
fev_tmp.f[2] += delz*fpair;
F_FLOAT evdwl = 0.0;
if (c.eflag) {
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
fev.evdwl += 0.5*evdwl;
}
if (c.vflag_either) {
fev.v[0] += 0.5*delx*delx*fpair;
fev.v[1] += 0.5*dely*dely*fpair;
fev.v[2] += 0.5*delz*delz*fpair;
fev.v[3] += 0.5*delx*dely*fpair;
fev.v[4] += 0.5*delx*delz*fpair;
fev.v[5] += 0.5*dely*delz*fpair;
}
}
},fev);
Kokkos::single(Kokkos::PerThread(team), [&] (){
f(i,0) += fev.f[0];
f(i,1) += fev.f[1];
f(i,2) += fev.f[2];
if (c.eflag_global)
ev.evdwl += fev.evdwl;
if (c.eflag_atom)
d_eatom(i,0) += fev.evdwl;
if (c.vflag_global) {
ev.v[0] += fev.v[0];
ev.v[1] += fev.v[1];
ev.v[2] += fev.v[2];
ev.v[3] += fev.v[3];
ev.v[4] += fev.v[4];
ev.v[5] += fev.v[5];
}
if (c.vflag_atom) {
d_vatom(i,0) += fev.v[0];
d_vatom(i,1) += fev.v[1];
d_vatom(i,2) += fev.v[2];
d_vatom(i,3) += fev.v[3];
d_vatom(i,4) += fev.v[4];
d_vatom(i,5) += fev.v[5];
}
});
});
return ev;
}
// Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial
// Loop over neighbors of one atom with coulomb interaction
// This function is called in parallel
KOKKOS_FUNCTION
EV_FLOAT compute_item_team_ev(Kokkos::TeamPolicy<>::member_type team,
const NeighListKokkos<device_type> &list, const CoulTag& ) const {
EV_FLOAT ev;
const int inum = team.league_size();
const int atoms_per_team = team.team_size();
int firstatom = team.league_rank()*atoms_per_team;
int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum;
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) {
const int i = list.d_ilist[ii];
const X_FLOAT xtmp = c.x(i,0);
const X_FLOAT ytmp = c.x(i,1);
const X_FLOAT ztmp = c.x(i,2);
const int itype = c.type(i);
const F_FLOAT qtmp = c.q(i);
const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i);
const int jnum = list.d_numneigh[i];
FEV_FLOAT fev;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum),
[&] (const int jj, FEV_FLOAT& fev_tmp) {
int j = neighbors_i(jj);
const F_FLOAT factor_lj = c.special_lj[sbmask(j)];
const F_FLOAT factor_coul = c.special_coul[sbmask(j)];
j &= NEIGHMASK;
const X_FLOAT delx = xtmp - c.x(j,0);
const X_FLOAT dely = ytmp - c.x(j,1);
const X_FLOAT delz = ztmp - c.x(j,2);
const int jtype = c.type(j);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) {
F_FLOAT fpair = F_FLOAT();
if(rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype)))
fpair+=factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
if(rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype)))
fpair+=c.template compute_fcoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
fev.f[0] += delx*fpair;
fev.f[1] += dely*fpair;
fev.f[2] += delz*fpair;
F_FLOAT evdwl = 0.0;
F_FLOAT ecoul = 0.0;
if (c.eflag) {
if(rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) {
evdwl = factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype);
ev.evdwl += 0.5*evdwl;
}
if(rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) {
ecoul = c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype,factor_coul,qtmp);
ev.ecoul += 0.5*ecoul;
}
}
if (c.vflag) {
fev.v[0] += 0.5*delx*delx*fpair;
fev.v[1] += 0.5*dely*dely*fpair;
fev.v[2] += 0.5*delz*delz*fpair;
fev.v[3] += 0.5*delx*dely*fpair;
fev.v[4] += 0.5*delx*delz*fpair;
fev.v[5] += 0.5*dely*delz*fpair;
}
}
},fev);
Kokkos::single(Kokkos::PerThread(team), [&] (){
f(i,0) += fev.f[0];
f(i,1) += fev.f[1];
f(i,2) += fev.f[2];
if (c.eflag_global) {
ev.evdwl += fev.evdwl;
ev.ecoul += fev.ecoul;
}
if (c.eflag_atom)
d_eatom(i,0) += fev.evdwl + fev.ecoul;
if (c.vflag_global) {
ev.v[0] += fev.v[0];
ev.v[1] += fev.v[1];
ev.v[2] += fev.v[2];
ev.v[3] += fev.v[3];
ev.v[4] += fev.v[4];
ev.v[5] += fev.v[5];
}
if (c.vflag_atom) {
d_vatom(i,0) += fev.v[0];
d_vatom(i,1) += fev.v[1];
d_vatom(i,2) += fev.v[2];
d_vatom(i,3) += fev.v[3];
d_vatom(i,4) += fev.v[4];
d_vatom(i,5) += fev.v[5];
}
});
});
return ev;
}
KOKKOS_INLINE_FUNCTION
void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
@ -355,6 +679,16 @@ struct PairComputeFunctor {
else
energy_virial += compute_item<1,0>(i,list,typename DoCoul<PairStyle::COUL_FLAG>::type());
}
KOKKOS_INLINE_FUNCTION
void operator()(const typename Kokkos::TeamPolicy<>::member_type& team) const {
compute_item_team(team,list,typename DoCoul<PairStyle::COUL_FLAG>::type());
}
KOKKOS_INLINE_FUNCTION
void operator()(const typename Kokkos::TeamPolicy<>::member_type& team, value_type &energy_virial) const {
energy_virial += compute_item_team_ev(team,list,typename DoCoul<PairStyle::COUL_FLAG>::type());
}
};
template <class PairStyle, bool STACKPARAMS, class Specialisation>
@ -489,6 +823,15 @@ struct PairComputeFunctor<PairStyle,N2,STACKPARAMS,Specialisation> {
void operator()(const int i, value_type &energy_virial) const {
energy_virial += compute_item<1,0>(i,list,typename DoCoul<PairStyle::COUL_FLAG>::type());
}
KOKKOS_INLINE_FUNCTION
void operator()(const typename Kokkos::TeamPolicy<>::member_type& team) const
{}
KOKKOS_INLINE_FUNCTION
void operator()(const typename Kokkos::TeamPolicy<>::member_type& team, value_type &energy_virial) const
{}
};
// Filter out Neighflags which are not supported for PairStyle
@ -507,20 +850,52 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, typename Kokkos::Impl::enable
return ev;
}
template<class FunctorStyle>
int GetTeamSize(FunctorStyle& functor, int team_size, int vector_length) {
int team_size_max = Kokkos::TeamPolicy<>::team_size_max(functor);
#ifdef KOKKOS_ENABLE_CUDA
if(team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
#else
team_size = 1;
#endif
return team_size;
}
// Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL,N2
template<class PairStyle, unsigned NEIGHFLAG, class Specialisation>
EV_FLOAT pair_compute_neighlist (PairStyle* fpair, typename Kokkos::Impl::enable_if<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*>::type list) {
EV_FLOAT ev;
if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
PairComputeFunctor<PairStyle,NEIGHFLAG,false,Specialisation > ff(fpair,list);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
else Kokkos::parallel_for(list->inum,ff);
ff.contribute();
if (fpair->lmp->kokkos->team_flag) {
int vector_length = 8;
int atoms_per_team = 32;
if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
PairComputeFunctor<PairStyle,NEIGHFLAG,false,Specialisation > ff(fpair,list);
atoms_per_team = GetTeamSize(ff, atoms_per_team, vector_length);
Kokkos::TeamPolicy<Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
else Kokkos::parallel_for(policy,ff);
} else {
PairComputeFunctor<PairStyle,NEIGHFLAG,true,Specialisation > ff(fpair,list);
atoms_per_team = GetTeamSize(ff, atoms_per_team, vector_length);
Kokkos::TeamPolicy<Kokkos::IndexType<int> > policy(list->inum,atoms_per_team,vector_length);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev);
else Kokkos::parallel_for(policy,ff);
}
} else {
PairComputeFunctor<PairStyle,NEIGHFLAG,true,Specialisation > ff(fpair,list);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
else Kokkos::parallel_for(list->inum,ff);
ff.contribute();
if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
PairComputeFunctor<PairStyle,NEIGHFLAG,false,Specialisation > ff(fpair,list);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
else Kokkos::parallel_for(list->inum,ff);
ff.contribute();
} else {
PairComputeFunctor<PairStyle,NEIGHFLAG,true,Specialisation > ff(fpair,list);
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
else Kokkos::parallel_for(list->inum,ff);
ff.contribute();
}
}
return ev;
}