USER-DPD Kokkos: use a parallel_for() to build the locals workplan for SSA

This commit is contained in:
Tim Mattox
2017-04-06 02:31:45 -04:00
parent ad5481831f
commit 9e272cb393
2 changed files with 84 additions and 74 deletions

View File

@ -34,6 +34,14 @@ namespace LAMMPS_NS {
/* ----------------------------------------------------------------------
   Constructor.
   Starts with ssa_phaseCt = 27 and ssa_gphaseCt = 7, then allocates the
   "g"-prefixed work plan dual views (presumably the ghost work plan --
   confirm against build_ghosts()) once, up front:
     k_ssa_gphaseLen : ssa_gphaseCt entries
     k_ssa_gitemLoc  : ssa_gphaseCt x ghostItemsPerPhase
     k_ssa_gitemLen  : ssa_gphaseCt x ghostItemsPerPhase
   After each allocation the matching DeviceType view is cached in the
   un-prefixed member (ssa_gphaseLen, ssa_gitemLoc, ssa_gitemLen).
------------------------------------------------------------------------- */
template<class DeviceType>
NPairSSAKokkos<DeviceType>::NPairSSAKokkos(LAMMPS *lmp)
  : NPair(lmp),
    ssa_phaseCt(27),
    ssa_gphaseCt(7)
{
  // second extent of the two 2d "gitem" views
  const int ghostItemsPerPhase = 1; //FIXME make this 4 eventually

  // per-phase length table
  k_ssa_gphaseLen =
    DAT::tdual_int_1d("NPairSSAKokkos:ssa_gphaseLen", ssa_gphaseCt);
  ssa_gphaseLen = k_ssa_gphaseLen.view<DeviceType>();

  // per-item location table
  k_ssa_gitemLoc =
    DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLoc", ssa_gphaseCt, ghostItemsPerPhase);
  ssa_gitemLoc = k_ssa_gitemLoc.view<DeviceType>();

  // per-item length table
  k_ssa_gitemLen =
    DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLen", ssa_gphaseCt, ghostItemsPerPhase);
  ssa_gitemLen = k_ssa_gitemLen.view<DeviceType>();
}
/* ----------------------------------------------------------------------
@ -132,6 +140,27 @@ void NPairSSAKokkos<DeviceType>::copy_stencil_info()
sx1 = ns_ssa->sx + 1;
sy1 = ns_ssa->sy + 1;
sz1 = ns_ssa->sz + 1;
// Setup the phases of the workplan for locals
ssa_phaseCt = sz1*sy1*sx1;
if (ssa_phaseCt > (int) k_ssa_phaseLen.dimension_0()) {
k_ssa_phaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_phaseLen",ssa_phaseCt);
ssa_phaseLen = k_ssa_phaseLen.view<DeviceType>();
k_ssa_phaseOff = DAT::tdual_int_1d_3("NPairSSAKokkos:ssa_phaseOff",ssa_phaseCt);
ssa_phaseOff = k_ssa_phaseOff.view<DeviceType>();
}
int workPhase = 0;
for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
ssa_phaseOff(workPhase, 0) = xoff;
ssa_phaseOff(workPhase, 1) = yoff;
ssa_phaseOff(workPhase, 2) = zoff;
workPhase++;
}
}
}
}
/* ---------------------------------------------------------------------- */
@ -208,18 +237,11 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
const int nlocal = includegroup?atom->nfirst:atom->nlocal;
int nl_size;
ssa_phaseCt = sz1*sy1*sx1;
int xbinCt = (lbinxhi - lbinxlo + sx1 - 1) / sx1 + 1;
int ybinCt = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1;
int zbinCt = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1;
int phaseLenEstimate = xbinCt*ybinCt*zbinCt;
int xbin = (lbinxhi - lbinxlo + sx1 - 1) / sx1 + 1;
int ybin = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1;
int zbin = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1;
int phaseLenEstimate = xbin*ybin*zbin;
int gphaseLenEstimate = 1; //FIXME make this 4 eventually
if (ssa_phaseCt > (int) k_ssa_phaseLen.dimension_0()) {
k_ssa_phaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_phaseLen",ssa_phaseCt);
ssa_phaseLen = k_ssa_phaseLen.view<DeviceType>();
}
if ((ssa_phaseCt > (int) k_ssa_itemLoc.dimension_0()) ||
(phaseLenEstimate > (int) k_ssa_itemLoc.dimension_1())) {
k_ssa_itemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_itemLoc",ssa_phaseCt,phaseLenEstimate);
@ -228,18 +250,6 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
ssa_itemLen = k_ssa_itemLen.view<DeviceType>();
}
if (ssa_gphaseCt > (int) k_ssa_gphaseLen.dimension_0()) {
k_ssa_gphaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_gphaseLen",ssa_gphaseCt);
ssa_gphaseLen = k_ssa_gphaseLen.view<DeviceType>();
}
if ((ssa_gphaseCt > (int) k_ssa_gitemLoc.dimension_0()) ||
(gphaseLenEstimate > (int) k_ssa_gitemLoc.dimension_1())) {
k_ssa_gitemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLoc",ssa_gphaseCt,gphaseLenEstimate);
ssa_gitemLoc = k_ssa_gitemLoc.view<DeviceType>();
k_ssa_gitemLen = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLen",ssa_gphaseCt,gphaseLenEstimate);
ssa_gitemLen = k_ssa_gitemLen.view<DeviceType>();
}
{ // Preflight the neighbor list workplan
const typename ArrayTypes<DeviceType>::t_int_1d_const c_bincount = k_bincount.view<DeviceType>();
const typename ArrayTypes<DeviceType>::t_int_2d_const c_bins = k_bins.view<DeviceType>();
@ -247,11 +257,11 @@ void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
const typename ArrayTypes<DeviceType>::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view<DeviceType>();
int inum = 0;
int workPhase = 0;
// loop over bins with local atoms, storing half of the neighbors
for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
// loop over bins with local atoms, counting half of the neighbors
for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
int zoff = ssa_phaseOff(workPhase, 2);
int yoff = ssa_phaseOff(workPhase, 1);
int xoff = ssa_phaseOff(workPhase, 0);
int workItem = 0;
for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
@ -308,9 +318,7 @@ fprintf(stdout, "phas%03d phase %3d could use %6d inums, expected %6d inums. max
);
#endif
// record where workPhase ends
ssa_phaseLen(workPhase++) = workItem;
}
}
ssa_phaseLen(workPhase) = workItem;
}
#ifdef DEBUG_SSA_BUILD_LOCALS
fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
@ -343,6 +351,7 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
k_nstencil_ssa.view<DeviceType>(),
ssa_phaseCt,
k_ssa_phaseLen.view<DeviceType>(),
k_ssa_phaseOff.view<DeviceType>(),
k_ssa_itemLoc.view<DeviceType>(),
k_ssa_itemLen.view<DeviceType>(),
ssa_gphaseCt,
@ -410,7 +419,17 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu
NPairSSAKokkosBuildFunctor<DeviceType> f(data,atoms_per_bin*5*sizeof(X_FLOAT));
Kokkos::parallel_for(nall, f);
#endif
data.build_locals(firstTry, comm->me);
// loop over bins with local atoms, storing half of the neighbors
#ifdef USE_LAMBDA_BUILD
Kokkos::parallel_for(ssa_phaseCt, LAMMPS_LAMBDA (const int workPhase) {
data.build_locals_onePhase(firstTry, comm->me, workPhase);
});
#else
NPairSSAKokkosBuildFunctor<DeviceType> f(data, firstTry, comm->me);
Kokkos::parallel_for(ssa_phaseCt, f);
#endif
data.neigh_list.inum = ssa_itemLoc(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1) +
ssa_itemLen(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1);
data.build_ghosts();
firstTry = false;
@ -451,20 +470,16 @@ fprintf(stdout, "Fina%03d %6d inum %6d gnum, total used %6d, allocated %6d\n"
template<class DeviceType>
void NPairSSAKokkosExecute<DeviceType>::build_locals(const bool firstTry, int me)
void NPairSSAKokkosExecute<DeviceType>::build_locals_onePhase(const bool firstTry, int me, int workPhase) const
{
const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil = d_stencil;
int which = 0;
int inum = 0;
int workPhase = 0;
// loop over bins with local atoms, storing half of the neighbors
for (int zoff = sz1 - 1; zoff >= 0; --zoff) {
for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
int workItem = 0;
int skippedItems = 0;
// inum = d_ssa_itemLoc(workPhase, workItem); // get where workPhase starts in ilist
int zoff = d_ssa_phaseOff(workPhase, 2);
int yoff = d_ssa_phaseOff(workPhase, 1);
int xoff = d_ssa_phaseOff(workPhase, 0);
int workItem = 0;
int skippedItems = 0;
for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) {
@ -474,21 +489,7 @@ void NPairSSAKokkosExecute<DeviceType>::build_locals(const bool firstTry, int me
continue;
}
int inum_start = d_ssa_itemLoc(workPhase, workItem + skippedItems);
#ifdef DEBUG_SSA_BUILD_LOCALS
if (inum > inum_start) { // This shouldn't happen!
fprintf(stdout, "Rank%03d workphase (%2d,%3d,%3d): inum = %4d, but ssa_itemLoc = %4d OVERFLOW\n"
,me
,workPhase
,workItem
,workItem + skippedItems
,inum
,d_ssa_itemLoc(workPhase, workItem + skippedItems)
);
inum_start = inum;
} else
#endif
inum = inum_start;
// d_ssa_itemLoc(workPhase, workItem) = inum; // record where workItem actually starts in ilist
int inum = inum_start;
for (int subphase = 0; subphase < 4; subphase++) {
int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0);
@ -600,23 +601,7 @@ fprintf(stdout, "Phas%03d phase %3d used %6d inums, workItems = %3d, skipped = %
d_ssa_itemLen(workPhase,workItem++) = 0;
}
}
++workPhase;
}
}
}
#ifdef DEBUG_SSA_BUILD_LOCALS
fprintf(stdout, "Totl%03d %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
,me
,workPhase
,inum
,nlocal*4
,inum / (double) workPhase
);
#endif
//FIXME if (ssa_phaseCt != workPhase) error->one(FLERR,"ssa_phaseCt was wrong");
neigh_list.inum = inum;
}

View File

@ -41,9 +41,11 @@ class NPairSSAKokkos : public NPair {
// SSA Work plan data structures
int ssa_phaseCt;
DAT::tdual_int_1d k_ssa_phaseLen;
DAT::tdual_int_1d_3 k_ssa_phaseOff;
DAT::tdual_int_2d k_ssa_itemLoc;
DAT::tdual_int_2d k_ssa_itemLen;
typename AT::t_int_1d ssa_phaseLen;
typename AT::t_int_1d_3 ssa_phaseOff;
typename AT::t_int_2d ssa_itemLoc;
typename AT::t_int_2d ssa_itemLen;
@ -175,6 +177,7 @@ class NPairSSAKokkosExecute
// SSA Work plan data structures
int ssa_phaseCt;
typename AT::t_int_1d d_ssa_phaseLen;
typename AT::t_int_1d_3_const d_ssa_phaseOff;
typename AT::t_int_2d d_ssa_itemLoc;
typename AT::t_int_2d d_ssa_itemLen;
int ssa_gphaseCt;
@ -198,6 +201,7 @@ class NPairSSAKokkosExecute
const typename AT::t_int_1d &_d_nstencil_ssa,
const int _ssa_phaseCt,
const typename AT::t_int_1d &_d_ssa_phaseLen,
const typename AT::t_int_1d_3 &_d_ssa_phaseOff,
const typename AT::t_int_2d &_d_ssa_itemLoc,
const typename AT::t_int_2d &_d_ssa_itemLen,
const int _ssa_gphaseCt,
@ -242,6 +246,7 @@ class NPairSSAKokkosExecute
d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),d_nstencil_ssa(_d_nstencil_ssa),
ssa_phaseCt(_ssa_phaseCt),
d_ssa_phaseLen(_d_ssa_phaseLen),
d_ssa_phaseOff(_d_ssa_phaseOff),
d_ssa_itemLoc(_d_ssa_itemLoc),
d_ssa_itemLen(_d_ssa_itemLen),
ssa_gphaseCt(_ssa_gphaseCt),
@ -289,7 +294,9 @@ class NPairSSAKokkosExecute
~NPairSSAKokkosExecute() {neigh_list.copymode = 1;};
void build_locals(const bool firstTry, int me);
KOKKOS_FUNCTION
void build_locals_onePhase(const bool firstTry, int me, int workPhase) const;
void build_ghosts();
KOKKOS_INLINE_FUNCTION
@ -344,6 +351,24 @@ class NPairSSAKokkosExecute
};
template<class DeviceType>
struct NPairSSAKokkosBuildFunctor {
typedef DeviceType device_type;
const NPairSSAKokkosExecute<DeviceType> c;
const bool firstTry;
const int me;
NPairSSAKokkosBuildFunctor(const NPairSSAKokkosExecute<DeviceType> &_c,
const bool _firstTry, const int _me):c(_c),
firstTry(_firstTry), me(_me) {};
KOKKOS_INLINE_FUNCTION
void operator() (const int & i) const {
c.build_locals_onePhase(firstTry, me, i);
}
};
}
#endif