From 58905525bfa26e76ef85d87dadff706cb751d098 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 6 Feb 2019 14:42:37 -0700 Subject: [PATCH] Add team-based calcs to some KOKKOS package pair_styles --- src/KOKKOS/kokkos.cpp | 8 + src/KOKKOS/kokkos.h | 1 + src/KOKKOS/kokkos_type.h | 46 +++++ src/KOKKOS/pair_kokkos.h | 393 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 439 insertions(+), 9 deletions(-) diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 9973b5a688..5d041bcbb0 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -192,6 +192,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) forward_comm_on_host = 0; reverse_comm_on_host = 0; gpu_direct_flag = 1; + team_flag = 0; #if KOKKOS_USE_CUDA // only if we can safely detect, that GPU-direct is not available, change default @@ -228,6 +229,7 @@ void KokkosLMP::accelerator(int narg, char **arg) exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; gpu_direct_flag = 1; + team_flag = 0; int iarg = 0; while (iarg < narg) { @@ -317,6 +319,12 @@ void KokkosLMP::accelerator(int narg, char **arg) else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct_flag = 1; else error->all(FLERR,"Illegal package kokkos command"); iarg += 2; + } else if (strcmp(arg[iarg],"team") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + if (strcmp(arg[iarg+1],"off") == 0) team_flag = 0; + else if (strcmp(arg[iarg+1],"on") == 0) team_flag = 1; + else error->all(FLERR,"Illegal package kokkos command"); + iarg += 2; } else error->all(FLERR,"Illegal package kokkos command"); } diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index cd429d5c1c..a665329d70 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -36,6 +36,7 @@ class KokkosLMP : protected Pointers { int numa; int auto_sync; int gpu_direct_flag; + int team_flag; KokkosLMP(class LAMMPS *, int, char **); ~KokkosLMP(); diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index b88c92ff73..16d7c3cbd2 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -448,6 +448,52 @@ struct s_EV_FLOAT_REAX { }; typedef struct s_EV_FLOAT_REAX EV_FLOAT_REAX; +struct s_FEV_FLOAT { + F_FLOAT f[3]; + E_FLOAT evdwl; + E_FLOAT ecoul; + E_FLOAT v[6]; + KOKKOS_INLINE_FUNCTION + s_FEV_FLOAT() { + f[0] = 0; f[1] = 0; f[2] = 0; + evdwl = 0; + ecoul = 0; + v[0] = 0; v[1] = 0; v[2] = 0; + v[3] = 0; v[4] = 0; v[5] = 0; + } + + KOKKOS_INLINE_FUNCTION + void operator+=(const s_FEV_FLOAT &rhs) { + f[0] += rhs.f[0]; + f[1] += rhs.f[1]; + f[2] += rhs.f[2]; + evdwl += rhs.evdwl; + ecoul += rhs.ecoul; + v[0] += rhs.v[0]; + v[1] += rhs.v[1]; + v[2] += rhs.v[2]; + v[3] += rhs.v[3]; + v[4] += rhs.v[4]; + v[5] += rhs.v[5]; + } + + KOKKOS_INLINE_FUNCTION + void operator+=(const volatile s_FEV_FLOAT &rhs) volatile { + f[0] += rhs.f[0]; + f[1] += rhs.f[1]; + f[2] += rhs.f[2]; + evdwl += rhs.evdwl; + ecoul += rhs.ecoul; + v[0] += rhs.v[0]; + v[1] += rhs.v[1]; + v[2] += rhs.v[2]; + v[3] += rhs.v[3]; + v[4] += rhs.v[4]; + v[5] += rhs.v[5]; + } +}; +typedef struct s_FEV_FLOAT FEV_FLOAT; + #ifndef PREC_POS #define PREC_POS PRECISION #endif diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index ab616d2c07..8758b2f03c 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -86,6 +86,7 @@ struct PairComputeFunctor { NeighListKokkos* list_ptr): c(*c_ptr),list(*list_ptr) { // allocate duplicated memory + f = c.f; dup_f = Kokkos::Experimental::create_scatter_view::value >(c.f); dup_eatom = Kokkos::Experimental::create_scatter_view::value >(c.d_eatom); dup_vatom = Kokkos::Experimental::create_scatter_view::value >(c.d_vatom); @@ -255,6 +256,329 @@ struct PairComputeFunctor { return ev; } + // Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial + // Loop over neighbors of one atom without coulomb interaction + // This function is called in parallel + KOKKOS_FUNCTION + void compute_item_team(Kokkos::TeamPolicy<>::member_type team, + const NeighListKokkos &list, const NoCoulTag&) const { + + const int inum = team.league_size(); + const int atoms_per_team = team.team_size(); + const int firstatom = team.league_rank()*atoms_per_team; + const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) { + + const int i = list.d_ilist[ii]; + const X_FLOAT xtmp = c.x(i,0); + const X_FLOAT ytmp = c.x(i,1); + const X_FLOAT ztmp = c.x(i,2); + const int itype = c.type(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + t_scalar3 fsum; + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum), + [&] (const int jj, t_scalar3& ftmp) { + + int j = neighbors_i(jj); + const F_FLOAT factor_lj = c.special_lj[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - c.x(j,0); + const X_FLOAT dely = ytmp - c.x(j,1); + const X_FLOAT delz = ztmp - c.x(j,2); + const int jtype = c.type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + const F_FLOAT fpair = factor_lj*c.template compute_fpair(rsq,i,j,itype,jtype); + + ftmp.x += delx*fpair; + ftmp.y += dely*fpair; + ftmp.z += delz*fpair; + } + + },fsum); + + Kokkos::single(Kokkos::PerThread(team), [&] (){ + f(i,0) += fsum.x; + f(i,1) += fsum.y; + f(i,2) += fsum.z; + }); + + }); + } + + // Use TeamPolicy, assume Newton off, Full Neighborlist, and no energy/virial + // Loop over neighbors of one atom with coulomb interaction + // This function is called in parallel + KOKKOS_FUNCTION + void compute_item_team(Kokkos::TeamPolicy<>::member_type team, + const NeighListKokkos &list, const CoulTag& ) const { + + const int inum = team.league_size(); + const int atoms_per_team = team.team_size(); + int firstatom = team.league_rank()*atoms_per_team; + int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) { + + const int i = list.d_ilist[ii]; + const X_FLOAT xtmp = c.x(i,0); + const X_FLOAT ytmp = c.x(i,1); + const X_FLOAT ztmp = c.x(i,2); + const int itype = c.type(i); + const F_FLOAT qtmp = c.q(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + t_scalar3 fsum; + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum), + [&] (const int jj, t_scalar3& ftmp) { + int j = neighbors_i(jj); + const F_FLOAT factor_lj = c.special_lj[sbmask(j)]; + const F_FLOAT factor_coul = c.special_coul[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - c.x(j,0); + const X_FLOAT dely = ytmp - c.x(j,1); + const X_FLOAT delz = ztmp - c.x(j,2); + const int jtype = c.type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + F_FLOAT fpair = F_FLOAT(); + + if(rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) + fpair+=factor_lj*c.template compute_fpair(rsq,i,j,itype,jtype); + if(rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) + fpair+=c.template compute_fcoul(rsq,i,j,itype,jtype,factor_coul,qtmp); + + ftmp.x += delx*fpair; + ftmp.y += dely*fpair; + ftmp.z += delz*fpair; + } + },fsum); + + Kokkos::single(Kokkos::PerThread(team), [&] (){ + f(i,0) += fsum.x; + f(i,1) += fsum.y; + f(i,2) += fsum.z; + }); + }); + } + + + // Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial + // Loop over neighbors of one atom without coulomb interaction + // This function is called in parallel + KOKKOS_FUNCTION + EV_FLOAT compute_item_team_ev(Kokkos::TeamPolicy<>::member_type team, + const NeighListKokkos &list, const NoCoulTag&) const { + + EV_FLOAT ev; + + const int inum = team.league_size(); + const int atoms_per_team = team.team_size(); + const int firstatom = team.league_rank()*atoms_per_team; + const int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) { + + + const int i = list.d_ilist[ii]; + const X_FLOAT xtmp = c.x(i,0); + const X_FLOAT ytmp = c.x(i,1); + const X_FLOAT ztmp = c.x(i,2); + const int itype = c.type(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + FEV_FLOAT fev; + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum), + [&] (const int jj, FEV_FLOAT& fev_tmp) { + + int j = neighbors_i(jj); + const F_FLOAT factor_lj = c.special_lj[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - c.x(j,0); + const X_FLOAT dely = ytmp - c.x(j,1); + const X_FLOAT delz = ztmp - c.x(j,2); + const int jtype = c.type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + const F_FLOAT fpair = factor_lj*c.template compute_fpair(rsq,i,j,itype,jtype); + + fev_tmp.f[0] += delx*fpair; + fev_tmp.f[1] += dely*fpair; + fev_tmp.f[2] += delz*fpair; + + F_FLOAT evdwl = 0.0; + if (c.eflag) { + evdwl = factor_lj * c.template compute_evdwl(rsq,i,j,itype,jtype); + fev.evdwl += 0.5*evdwl; + } + if (c.vflag_either) { + fev.v[0] += 0.5*delx*delx*fpair; + fev.v[1] += 0.5*dely*dely*fpair; + fev.v[2] += 0.5*delz*delz*fpair; + fev.v[3] += 0.5*delx*dely*fpair; + fev.v[4] += 0.5*delx*delz*fpair; + fev.v[5] += 0.5*dely*delz*fpair; + } + } + },fev); + + Kokkos::single(Kokkos::PerThread(team), [&] (){ + f(i,0) += fev.f[0]; + f(i,1) += fev.f[1]; + f(i,2) += fev.f[2]; + + if (c.eflag_global) + ev.evdwl += fev.evdwl; + + if (c.eflag_atom) + d_eatom(i,0) += fev.evdwl; + + if (c.vflag_global) { + ev.v[0] += fev.v[0]; + ev.v[1] += fev.v[1]; + ev.v[2] += fev.v[2]; + ev.v[3] += fev.v[3]; + ev.v[4] += fev.v[4]; + ev.v[5] += fev.v[5]; + } + + if (c.vflag_atom) { + d_vatom(i,0) += fev.v[0]; + d_vatom(i,1) += fev.v[1]; + d_vatom(i,2) += fev.v[2]; + d_vatom(i,3) += fev.v[3]; + d_vatom(i,4) += fev.v[4]; + d_vatom(i,5) += fev.v[5]; + } + }); + }); + return ev; + } + + // Use TeamPolicy, assume Newton off, Full Neighborlist, and energy/virial + // Loop over neighbors of one atom with coulomb interaction + // This function is called in parallel + KOKKOS_FUNCTION + EV_FLOAT compute_item_team_ev(Kokkos::TeamPolicy<>::member_type team, + const NeighListKokkos &list, const CoulTag& ) const { + + EV_FLOAT ev; + + const int inum = team.league_size(); + const int atoms_per_team = team.team_size(); + int firstatom = team.league_rank()*atoms_per_team; + int lastatom = firstatom + atoms_per_team < inum ? firstatom + atoms_per_team : inum; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, firstatom, lastatom), [&] (const int &ii) { + + const int i = list.d_ilist[ii]; + const X_FLOAT xtmp = c.x(i,0); + const X_FLOAT ytmp = c.x(i,1); + const X_FLOAT ztmp = c.x(i,2); + const int itype = c.type(i); + const F_FLOAT qtmp = c.q(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + FEV_FLOAT fev; + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,jnum), + [&] (const int jj, FEV_FLOAT& fev_tmp) { + int j = neighbors_i(jj); + const F_FLOAT factor_lj = c.special_lj[sbmask(j)]; + const F_FLOAT factor_coul = c.special_coul[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - c.x(j,0); + const X_FLOAT dely = ytmp - c.x(j,1); + const X_FLOAT delz = ztmp - c.x(j,2); + const int jtype = c.type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + F_FLOAT fpair = F_FLOAT(); + + if(rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) + fpair+=factor_lj*c.template compute_fpair(rsq,i,j,itype,jtype); + if(rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) + fpair+=c.template compute_fcoul(rsq,i,j,itype,jtype,factor_coul,qtmp); + + fev.f[0] += delx*fpair; + fev.f[1] += dely*fpair; + fev.f[2] += delz*fpair; + + F_FLOAT evdwl = 0.0; + F_FLOAT ecoul = 0.0; + if (c.eflag) { + if(rsq < (STACKPARAMS?c.m_cut_ljsq[itype][jtype]:c.d_cut_ljsq(itype,jtype))) { + evdwl = factor_lj * c.template compute_evdwl(rsq,i,j,itype,jtype); + ev.evdwl += 0.5*evdwl; + } + if(rsq < (STACKPARAMS?c.m_cut_coulsq[itype][jtype]:c.d_cut_coulsq(itype,jtype))) { + ecoul = c.template compute_ecoul(rsq,i,j,itype,jtype,factor_coul,qtmp); + ev.ecoul += 0.5*ecoul; + } + } + if (c.vflag) { + fev.v[0] += 0.5*delx*delx*fpair; + fev.v[1] += 0.5*dely*dely*fpair; + fev.v[2] += 0.5*delz*delz*fpair; + fev.v[3] += 0.5*delx*dely*fpair; + fev.v[4] += 0.5*delx*delz*fpair; + fev.v[5] += 0.5*dely*delz*fpair; + } + } + },fev); + + Kokkos::single(Kokkos::PerThread(team), [&] (){ + f(i,0) += fev.f[0]; + f(i,1) += fev.f[1]; + f(i,2) += fev.f[2]; + + if (c.eflag_global) { + ev.evdwl += fev.evdwl; + ev.ecoul += fev.ecoul; + } + + if (c.eflag_atom) + d_eatom(i,0) += fev.evdwl + fev.ecoul; + + if (c.vflag_global) { + ev.v[0] += fev.v[0]; + ev.v[1] += fev.v[1]; + ev.v[2] += fev.v[2]; + ev.v[3] += fev.v[3]; + ev.v[4] += fev.v[4]; + ev.v[5] += fev.v[5]; + } + + if (c.vflag_atom) { + d_vatom(i,0) += fev.v[0]; + d_vatom(i,1) += fev.v[1]; + d_vatom(i,2) += fev.v[2]; + d_vatom(i,3) += fev.v[3]; + d_vatom(i,4) += fev.v[4]; + d_vatom(i,5) += fev.v[5]; + } + }); + }); + return ev; + } + KOKKOS_INLINE_FUNCTION void ev_tally(EV_FLOAT &ev, const int &i, const int &j, const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx, @@ -355,6 +679,16 @@ struct PairComputeFunctor { else energy_virial += compute_item<1,0>(i,list,typename DoCoul::type()); } + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy<>::member_type& team) const { + compute_item_team(team,list,typename DoCoul::type()); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy<>::member_type& team, value_type &energy_virial) const { + energy_virial += compute_item_team_ev(team,list,typename DoCoul::type()); + } }; template @@ -489,6 +823,15 @@ struct PairComputeFunctor { void operator()(const int i, value_type &energy_virial) const { energy_virial += compute_item<1,0>(i,list,typename DoCoul::type()); } + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy<>::member_type& team) const + {} + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy<>::member_type& team, value_type &energy_virial) const + {} + }; // Filter out Neighflags which are not supported for PairStyle @@ -507,20 +850,52 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, typename Kokkos::Impl::enable return ev; } +template +int GetTeamSize(FunctorStyle& functor, int team_size, int vector_length) { + int team_size_max = Kokkos::TeamPolicy<>::team_size_max(functor); + +#ifdef KOKKOS_ENABLE_CUDA + if(team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +#else + team_size = 1; +#endif + return team_size; +} + // Submit ParallelFor for NEIGHFLAG=HALF,HALFTHREAD,FULL,N2 template EV_FLOAT pair_compute_neighlist (PairStyle* fpair, typename Kokkos::Impl::enable_if<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos*>::type list) { EV_FLOAT ev; - if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { - PairComputeFunctor ff(fpair,list); - if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); - else Kokkos::parallel_for(list->inum,ff); - ff.contribute(); + if (fpair->lmp->kokkos->team_flag) { + int vector_length = 8; + int atoms_per_team = 32; + + if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { + PairComputeFunctor ff(fpair,list); + atoms_per_team = GetTeamSize(ff, atoms_per_team, vector_length); + Kokkos::TeamPolicy > policy(list->inum,atoms_per_team,vector_length); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev); + else Kokkos::parallel_for(policy,ff); + } else { + PairComputeFunctor ff(fpair,list); + atoms_per_team = GetTeamSize(ff, atoms_per_team, vector_length); + Kokkos::TeamPolicy > policy(list->inum,atoms_per_team,vector_length); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(policy,ff,ev); + else Kokkos::parallel_for(policy,ff); + } } else { - PairComputeFunctor ff(fpair,list); - if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); - else Kokkos::parallel_for(list->inum,ff); - ff.contribute(); + if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { + PairComputeFunctor ff(fpair,list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + ff.contribute(); + } else { + PairComputeFunctor ff(fpair,list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + ff.contribute(); + } } return ev; }