From 36cbe439780dc8b44ecbb25036327853033aab68 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 6 Jun 2017 10:51:26 -0600 Subject: [PATCH 1/7] Fixing some CUDA runtime issues in npair_ssa_kokkos --- src/KOKKOS/npair_ssa_kokkos.cpp | 100 +++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 26 deletions(-) diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp index ba4bc9171c..0c3a5985ff 100644 --- a/src/KOKKOS/npair_ssa_kokkos.cpp +++ b/src/KOKKOS/npair_ssa_kokkos.cpp @@ -149,17 +149,21 @@ void NPairSSAKokkos::copy_stencil_info() k_ssa_phaseOff = DAT::tdual_int_1d_3("NPairSSAKokkos:ssa_phaseOff",ssa_phaseCt); ssa_phaseOff = k_ssa_phaseOff.view(); } + auto h_ssa_phaseOff = k_ssa_phaseOff.h_view; + k_ssa_phaseOff.sync(); int workPhase = 0; for (int zoff = sz1 - 1; zoff >= 0; --zoff) { for (int yoff = sy1 - 1; yoff >= 0; --yoff) { for (int xoff = sx1 - 1; xoff >= 0; --xoff) { - ssa_phaseOff(workPhase, 0) = xoff; - ssa_phaseOff(workPhase, 1) = yoff; - ssa_phaseOff(workPhase, 2) = zoff; + h_ssa_phaseOff(workPhase, 0) = xoff; + h_ssa_phaseOff(workPhase, 1) = yoff; + h_ssa_phaseOff(workPhase, 2) = zoff; workPhase++; } } } + k_ssa_phaseOff.modify(); + k_ssa_phaseOff.sync(); } @@ -250,8 +254,25 @@ void NPairSSAKokkos::build(NeighList *list_) ssa_itemLen = k_ssa_itemLen.view(); } + k_ssa_itemLoc.sync(); + k_ssa_itemLen.sync(); + k_ssa_gitemLoc.sync(); + k_ssa_gitemLen.sync(); + k_ssa_phaseOff.sync(); + k_ssa_phaseLen.sync(); + k_ssa_gphaseLen.sync(); + auto h_ssa_itemLoc = k_ssa_itemLoc.h_view; + auto h_ssa_itemLen = k_ssa_itemLen.h_view; + auto h_ssa_gitemLoc = k_ssa_gitemLoc.h_view; + auto h_ssa_gitemLen = k_ssa_gitemLen.h_view; + auto h_ssa_phaseOff = k_ssa_phaseOff.h_view; + auto h_ssa_phaseLen = k_ssa_phaseLen.h_view; + auto h_ssa_gphaseLen = k_ssa_gphaseLen.h_view; + { // Preflight the neighbor list workplan const typename ArrayTypes::t_int_1d_const c_bincount = k_bincount.view(); + k_bincount.sync(); + auto h_bincount = k_bincount.h_view; const typename ArrayTypes::t_int_2d_const c_bins = k_bins.view(); const typename ArrayTypes::t_int_1d_const_um c_stencil = k_stencil.view(); const typename ArrayTypes::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view(); @@ -259,9 +280,9 @@ void NPairSSAKokkos::build(NeighList *list_) // loop over bins with local atoms, counting half of the neighbors for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) { - int zoff = ssa_phaseOff(workPhase, 2); - int yoff = ssa_phaseOff(workPhase, 1); - int xoff = ssa_phaseOff(workPhase, 0); + int zoff = h_ssa_phaseOff(workPhase, 2); + int yoff = h_ssa_phaseOff(workPhase, 1); + int xoff = h_ssa_phaseOff(workPhase, 0); int workItem = 0; for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) { for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) { @@ -276,14 +297,14 @@ void NPairSSAKokkos::build(NeighList *list_) if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue; const int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin; - const int ibinCt = c_bincount(ibin); + const int ibinCt = h_bincount(ibin); if (ibinCt > 0) { int base_n = 0; bool include_same = false; // count all local atoms in the current stencil "subphase" as potential neighbors for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) { const int jbin = ibin+c_stencil(k); - if (jbin != ibin) base_n += c_bincount(jbin); + if (jbin != ibin) base_n += h_bincount(jbin); else include_same = true; } // Calculate how many ibin particles would have had some neighbors @@ -291,10 +312,10 @@ void NPairSSAKokkos::build(NeighList *list_) else if (include_same) inum += ibinCt - 1; } } - ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist - ssa_itemLen(workPhase,workItem) = inum - inum_start; // record workItem length + h_ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist + h_ssa_itemLen(workPhase,workItem) = inum - inum_start; // record workItem length #ifdef DEBUG_SSA_BUILD_LOCALS -if (ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3d) inum %d - inum_start %d UNDERFLOW\n" +if (h_ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3d) inum %d - inum_start %d UNDERFLOW\n" ,comm->me ,workPhase ,workItem @@ -311,14 +332,14 @@ if (ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3 fprintf(stdout, "phas%03d phase %3d could use %6d inums, expected %6d inums. maxworkItems = %3d, inums/workItems = %g\n" ,comm->me ,workPhase - ,inum - ssa_itemLoc(workPhase, 0) + ,inum - h_ssa_itemLoc(workPhase, 0) ,(nlocal*4 + ssa_phaseCt - 1) / ssa_phaseCt ,workItem - ,(inum - ssa_itemLoc(workPhase, 0)) / (double) workItem + ,(inum - h_ssa_itemLoc(workPhase, 0)) / (double) workItem ); #endif // record where workPhase ends - ssa_phaseLen(workPhase) = workItem; + h_ssa_phaseLen(workPhase) = workItem; } #ifdef DEBUG_SSA_BUILD_LOCALS fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n" @@ -331,15 +352,30 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu #endif nl_size = inum; // record how much space is needed for the local work plan } + // count how many ghosts might have neighbors, and increase the work plan storage for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) { int len = k_gbincount.h_view(workPhase + 1); - ssa_gitemLoc(workPhase,0) = nl_size; // record where workItem starts in ilist - ssa_gitemLen(workPhase,0) = len; + h_ssa_gitemLoc(workPhase,0) = nl_size; // record where workItem starts in ilist + h_ssa_gitemLen(workPhase,0) = len; nl_size += len; } list->grow(nl_size); // Make special larger SSA neighbor list + k_ssa_itemLoc.modify(); + k_ssa_itemLen.modify(); + k_ssa_gitemLoc.modify(); + k_ssa_gitemLen.modify(); + k_ssa_phaseOff.modify(); + k_ssa_phaseLen.modify(); + k_ssa_itemLoc.sync(); + k_ssa_itemLen.sync(); + k_ssa_gitemLen.sync(); + k_ssa_gitemLoc.sync(); + k_ssa_phaseOff.sync(); + k_ssa_phaseLen.sync(); + k_ssa_gphaseLen.sync(); + NPairSSAKokkosExecute data(*list, k_cutneighsq.view(), @@ -422,15 +458,27 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu Kokkos::parallel_for(ssa_phaseCt, LAMMPS_LAMBDA (const int workPhase) { data.build_locals_onePhase(firstTry, comm->me, workPhase); }); - data.neigh_list.inum = ssa_itemLoc(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1) + - ssa_itemLen(ssa_phaseCt-1,ssa_phaseLen(ssa_phaseCt-1)-1); + k_ssa_itemLoc.modify(); + k_ssa_itemLen.modify(); + k_ssa_phaseLen.modify(); + k_ssa_itemLoc.sync(); + k_ssa_itemLen.sync(); + k_ssa_phaseLen.sync(); + data.neigh_list.inum = h_ssa_itemLoc(ssa_phaseCt-1,h_ssa_phaseLen(ssa_phaseCt-1)-1) + + h_ssa_itemLen(ssa_phaseCt-1,h_ssa_phaseLen(ssa_phaseCt-1)-1); // loop over AIR ghost atoms, storing their local neighbors Kokkos::parallel_for(ssa_gphaseCt, LAMMPS_LAMBDA (const int workPhase) { data.build_ghosts_onePhase(workPhase); }); - data.neigh_list.gnum = ssa_gitemLoc(ssa_gphaseCt-1,ssa_gphaseLen(ssa_gphaseCt-1)-1) + - ssa_gitemLen(ssa_gphaseCt-1,ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum; + k_ssa_gitemLoc.modify(); + k_ssa_gitemLen.modify(); + k_ssa_gphaseLen.modify(); + k_ssa_gitemLoc.sync(); + k_ssa_gitemLen.sync(); + k_ssa_gphaseLen.sync(); + data.neigh_list.gnum = h_ssa_gitemLoc(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) + + h_ssa_gitemLen(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum; firstTry = false; DeviceType::fence(); @@ -445,12 +493,12 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu } } - k_ssa_phaseLen.modify(); - k_ssa_itemLoc.modify(); - k_ssa_itemLen.modify(); - k_ssa_gphaseLen.modify(); - k_ssa_gitemLoc.modify(); - k_ssa_gitemLen.modify(); + //k_ssa_phaseLen.modify(); + //k_ssa_itemLoc.modify(); + //k_ssa_itemLen.modify(); + //k_ssa_gphaseLen.modify(); + //k_ssa_gitemLoc.modify(); + //k_ssa_gitemLen.modify(); list->inum = data.neigh_list.inum; //FIXME once the above is in a parallel_for list->gnum = data.neigh_list.gnum; // it will need a deep_copy or something From efe60bf991c69d0cdd0e1f960f060c53abb62457 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 6 Jun 2017 13:10:04 -0600 Subject: [PATCH 2/7] Fixing more CUDA runtime issues --- src/KOKKOS/nbin_ssa_kokkos.cpp | 2 ++ src/KOKKOS/npair_ssa_kokkos.cpp | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp index 6c9e3a3446..f11d7e18ef 100644 --- a/src/KOKKOS/nbin_ssa_kokkos.cpp +++ b/src/KOKKOS/nbin_ssa_kokkos.cpp @@ -212,6 +212,8 @@ void NBinSSAKokkos::bin_atoms() }); DeviceType::fence(); } + k_bins.modify(); + k_bincount.modify(); c_bins = bins; // bins won't change until the next bin_atoms //now dispose of the k_binID array diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp index 0c3a5985ff..368fb1a6ed 100644 --- a/src/KOKKOS/npair_ssa_kokkos.cpp +++ b/src/KOKKOS/npair_ssa_kokkos.cpp @@ -275,7 +275,11 @@ void NPairSSAKokkos::build(NeighList *list_) auto h_bincount = k_bincount.h_view; const typename ArrayTypes::t_int_2d_const c_bins = k_bins.view(); const typename ArrayTypes::t_int_1d_const_um c_stencil = k_stencil.view(); + k_stencil.sync(); + auto h_stencil = k_stencil.h_view; const typename ArrayTypes::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view(); + k_nstencil_ssa.sync(); + auto h_nstencil_ssa = k_nstencil_ssa.h_view; int inum = 0; // loop over bins with local atoms, counting half of the neighbors @@ -302,8 +306,8 @@ void NPairSSAKokkos::build(NeighList *list_) int base_n = 0; bool include_same = false; // count all local atoms in the current stencil "subphase" as potential neighbors - for (int k = c_nstencil_ssa(subphase); k < c_nstencil_ssa(subphase+1); k++) { - const int jbin = ibin+c_stencil(k); + for (int k = h_nstencil_ssa(subphase); k < h_nstencil_ssa(subphase+1); k++) { + const int jbin = ibin+h_stencil(k); if (jbin != ibin) base_n += h_bincount(jbin); else include_same = true; } From 520ab26bd966b5fda778b5e30f4cbdeb95d8e842 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 7 Jun 2017 15:07:53 -0600 Subject: [PATCH 3/7] Fixing more CUDA runtime issues --- src/KOKKOS/nbin_ssa_kokkos.cpp | 3 +++ src/KOKKOS/npair_ssa_kokkos.cpp | 9 ++------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp index f11d7e18ef..883ba25b24 100644 --- a/src/KOKKOS/nbin_ssa_kokkos.cpp +++ b/src/KOKKOS/nbin_ssa_kokkos.cpp @@ -216,6 +216,9 @@ void NBinSSAKokkos::bin_atoms() k_bincount.modify(); c_bins = bins; // bins won't change until the next bin_atoms + k_gbins.modify(); + k_gbincount.modify(); + //now dispose of the k_binID array k_binID = DAT::tdual_int_1d("NBinSSAKokkos::binID",0); binID = k_binID.view(); diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp index 368fb1a6ed..aec482993d 100644 --- a/src/KOKKOS/npair_ssa_kokkos.cpp +++ b/src/KOKKOS/npair_ssa_kokkos.cpp @@ -260,24 +260,18 @@ void NPairSSAKokkos::build(NeighList *list_) k_ssa_gitemLen.sync(); k_ssa_phaseOff.sync(); k_ssa_phaseLen.sync(); - k_ssa_gphaseLen.sync(); auto h_ssa_itemLoc = k_ssa_itemLoc.h_view; auto h_ssa_itemLen = k_ssa_itemLen.h_view; auto h_ssa_gitemLoc = k_ssa_gitemLoc.h_view; auto h_ssa_gitemLen = k_ssa_gitemLen.h_view; auto h_ssa_phaseOff = k_ssa_phaseOff.h_view; auto h_ssa_phaseLen = k_ssa_phaseLen.h_view; - auto h_ssa_gphaseLen = k_ssa_gphaseLen.h_view; { // Preflight the neighbor list workplan - const typename ArrayTypes::t_int_1d_const c_bincount = k_bincount.view(); k_bincount.sync(); auto h_bincount = k_bincount.h_view; - const typename ArrayTypes::t_int_2d_const c_bins = k_bins.view(); - const typename ArrayTypes::t_int_1d_const_um c_stencil = k_stencil.view(); k_stencil.sync(); auto h_stencil = k_stencil.h_view; - const typename ArrayTypes::t_int_1d_const c_nstencil_ssa = k_nstencil_ssa.view(); k_nstencil_ssa.sync(); auto h_nstencil_ssa = k_nstencil_ssa.h_view; int inum = 0; @@ -358,6 +352,7 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu } // count how many ghosts might have neighbors, and increase the work plan storage + k_gbincount.sync(); for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) { int len = k_gbincount.h_view(workPhase + 1); h_ssa_gitemLoc(workPhase,0) = nl_size; // record where workItem starts in ilist @@ -370,7 +365,6 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu k_ssa_itemLen.modify(); k_ssa_gitemLoc.modify(); k_ssa_gitemLen.modify(); - k_ssa_phaseOff.modify(); k_ssa_phaseLen.modify(); k_ssa_itemLoc.sync(); k_ssa_itemLen.sync(); @@ -481,6 +475,7 @@ fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inu k_ssa_gitemLoc.sync(); k_ssa_gitemLen.sync(); k_ssa_gphaseLen.sync(); + auto h_ssa_gphaseLen = k_ssa_gphaseLen.h_view; data.neigh_list.gnum = h_ssa_gitemLoc(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) + h_ssa_gitemLen(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum; firstTry = false; From b4b7310884382a18f9439983a4c241c24998d88c Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 8 Jun 2017 13:33:23 -0600 Subject: [PATCH 4/7] Fixing CUDA runtime issues in pair_exp6_rx_kokkos --- src/KOKKOS/pair_exp6_rx_kokkos.cpp | 16 ++++++++-------- src/KOKKOS/pair_exp6_rx_kokkos.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp index 5b84f09fd6..1eb1c6c770 100644 --- a/src/KOKKOS/pair_exp6_rx_kokkos.cpp +++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp @@ -426,7 +426,7 @@ KOKKOS_INLINE_FUNCTION void PairExp6rxKokkos::operator()(TagPairExp6rxCompute, const int &ii, EV_FLOAT& ev) const { { - const bool one_type = (atom->ntypes == 1); + const bool one_type = (ntypes == 1); if (isite1 == isite2) if (one_type) this->vectorized_operator(ii, ev); @@ -797,7 +797,7 @@ KOKKOS_INLINE_FUNCTION void PairExp6rxKokkos::operator()(TagPairExp6rxComputeNoAtomics, const int &ii, EV_FLOAT& ev) const { { - const bool one_type = (atom->ntypes == 1); + const bool one_type = (ntypes == 1); if (isite1 == isite2) if (one_type) this->vectorized_operator(ii, ev); @@ -1653,18 +1653,18 @@ template void PairExp6rxKokkos::allocate() { allocated = 1; - int n = atom->ntypes; + ntypes = atom->ntypes; - memory->create(setflag,n+1,n+1,"pair:setflag"); - for (int i = 1; i <= n; i++) - for (int j = i; j <= n; j++) + memory->create(setflag,ntypes+1,ntypes+1,"pair:setflag"); + for (int i = 1; i <= ntypes; i++) + for (int j = i; j <= ntypes; j++) setflag[i][j] = 0; - memory->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq"); + memory->create_kokkos(k_cutsq,cutsq,ntypes+1,ntypes+1,"pair:cutsq"); d_cutsq = k_cutsq.template view(); k_cutsq.template modify(); - memory->create(cut,n+1,n+1,"pair:cut_lj"); + memory->create(cut,ntypes+1,ntypes+1,"pair:cut_lj"); } diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h index 09283662a2..4c35c76851 100644 --- a/src/KOKKOS/pair_exp6_rx_kokkos.h +++ b/src/KOKKOS/pair_exp6_rx_kokkos.h @@ -145,7 +145,7 @@ class PairExp6rxKokkos : public PairExp6rx { int eflag,vflag; int nlocal,newton_pair,neighflag; double special_lj[4]; - int num_threads; + int num_threads,ntypes; typename AT::t_x_array_randomread x; typename AT::t_f_array f; From 86497949f20a2a6ae0609172e9aabf4e7221390d Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 8 Jun 2017 13:40:20 -0600 Subject: [PATCH 5/7] Fixing CUDA runtime issues in fix_shardlow_kokkos --- src/KOKKOS/fix_shardlow_kokkos.cpp | 21 ++++++++++++++++----- src/KOKKOS/fix_shardlow_kokkos.h | 2 ++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp index 52287d586c..b3d4e86244 100644 --- a/src/KOKKOS/fix_shardlow_kokkos.cpp +++ b/src/KOKKOS/fix_shardlow_kokkos.cpp @@ -444,9 +444,6 @@ void FixShardlowKokkos::ssa_update_dpde( rand_type rand_gen = rand_pool.get_state(id); #endif - const double boltz_inv = 1.0/force->boltz; - const double ftm2v = force->ftm2v; - const double dt = update->dt; int ct = count; int ii = start_ii; @@ -639,6 +636,16 @@ void FixShardlowKokkos::initial_integrate(int vflag) ssa_gitemLoc = np_ssa->ssa_gitemLoc; ssa_gitemLen = np_ssa->ssa_gitemLen; + np_ssa->k_ssa_itemLoc.template sync(); + np_ssa->k_ssa_itemLen.template sync(); + np_ssa->k_ssa_gitemLoc.template sync(); + np_ssa->k_ssa_gitemLen.template sync(); + + np_ssa->k_ssa_phaseLen.template sync(); + np_ssa->k_ssa_gphaseLen.template sync(); + auto h_ssa_phaseLen = np_ssa->k_ssa_phaseLen.h_view; + auto h_ssa_gphaseLen = np_ssa->k_ssa_gphaseLen.h_view; + int maxWorkItemCt = (int) ssa_itemLoc.dimension_1(); if (maxWorkItemCt < (int) ssa_gitemLoc.dimension_1()) { maxWorkItemCt = (int) ssa_gitemLoc.dimension_1(); @@ -670,9 +677,13 @@ void FixShardlowKokkos::initial_integrate(int vflag) deep_copy(d_hist, h_hist); #endif + boltz_inv = 1.0/force->boltz; + ftm2v = force->ftm2v; + dt = update->dt; + // process neighbors in the local AIR for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) { - int workItemCt = ssa_phaseLen[workPhase]; + int workItemCt = h_ssa_phaseLen[workPhase]; if(atom->ntypes > MAX_TYPES_STACKPARAMS) { Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) { @@ -692,7 +703,7 @@ void FixShardlowKokkos::initial_integrate(int vflag) //Loop over all 13 outward directions (7 stages) for (int workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) { // int airnum = workPhase + 1; - int workItemCt = ssa_gphaseLen[workPhase]; + int workItemCt = h_ssa_gphaseLen[workPhase]; // Communicate the updated velocities to all nodes comm->forward_comm_fix(this); diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h index 4dc47709e1..df8849d80b 100644 --- a/src/KOKKOS/fix_shardlow_kokkos.h +++ b/src/KOKKOS/fix_shardlow_kokkos.h @@ -68,6 +68,8 @@ class FixShardlowKokkos : public FixShardlow { #endif protected: + double boltz_inv,ftm2v,dt; + // class PairDPDfdt *pairDPD; PairDPDfdtEnergyKokkos *k_pairDPDE; From c51cadcc6c38ff2c939fb0bed46dd73c09873c2d Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 9 Jun 2017 09:31:37 -0600 Subject: [PATCH 6/7] Fixing CUDA runtime issues in fix_shardlow_kokkos --- src/KOKKOS/fix_shardlow_kokkos.cpp | 66 ++++++++++++++++-------------- src/KOKKOS/fix_shardlow_kokkos.h | 17 +++++++- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp index b3d4e86244..d2fb937a57 100644 --- a/src/KOKKOS/fix_shardlow_kokkos.cpp +++ b/src/KOKKOS/fix_shardlow_kokkos.cpp @@ -436,7 +436,7 @@ template KOKKOS_INLINE_FUNCTION void FixShardlowKokkos::ssa_update_dpde( int start_ii, int count, int id -) +) const { #ifdef DPD_USE_RAN_MARS class RanMars *pRNG = pp_random[id]; @@ -682,26 +682,18 @@ void FixShardlowKokkos::initial_integrate(int vflag) dt = update->dt; // process neighbors in the local AIR - for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) { + for (workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) { int workItemCt = h_ssa_phaseLen[workPhase]; - if(atom->ntypes > MAX_TYPES_STACKPARAMS) { - Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) { - int ct = ssa_itemLen(workPhase, workItem); - int ii = ssa_itemLoc(workPhase, workItem); - ssa_update_dpde(ii, ct, workItem); - }); - } else { - Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) { - int ct = ssa_itemLen(workPhase, workItem); - int ii = ssa_itemLoc(workPhase, workItem); - ssa_update_dpde(ii, ct, workItem); - }); - } + + if(atom->ntypes > MAX_TYPES_STACKPARAMS) + Kokkos::parallel_for(Kokkos::RangePolicy >(0,workItemCt),*this); + else + Kokkos::parallel_for(Kokkos::RangePolicy >(0,workItemCt),*this); } //Loop over all 13 outward directions (7 stages) - for (int workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) { + for (workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) { // int airnum = workPhase + 1; int workItemCt = h_ssa_gphaseLen[workPhase]; @@ -713,27 +705,21 @@ void FixShardlowKokkos::initial_integrate(int vflag) // memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost); // memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost); + // must capture local variables, not class variables + auto l_uCond = uCond; + auto l_uMech = uMech; Kokkos::parallel_for(Kokkos::RangePolicy(nlocal,nlocal+nghost), LAMMPS_LAMBDA (const int i) { - uCond(i) = 0.0; - uMech(i) = 0.0; + l_uCond(i) = 0.0; + l_uMech(i) = 0.0; }); DeviceType::fence(); } // process neighbors in this AIR - if(atom->ntypes > MAX_TYPES_STACKPARAMS) { - Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) { - int ct = ssa_gitemLen(workPhase, workItem); - int ii = ssa_gitemLoc(workPhase, workItem); - ssa_update_dpde(ii, ct, workItem); - }); - } else { - Kokkos::parallel_for(workItemCt, LAMMPS_LAMBDA (const int workItem ) { - int ct = ssa_gitemLen(workPhase, workItem); - int ii = ssa_gitemLoc(workPhase, workItem); - ssa_update_dpde(ii, ct, workItem); - }); - } + if(atom->ntypes > MAX_TYPES_STACKPARAMS) + Kokkos::parallel_for(Kokkos::RangePolicy >(0,workItemCt),*this); + else + Kokkos::parallel_for(Kokkos::RangePolicy >(0,workItemCt),*this); // Communicate the ghost deltas to the atom owners comm->reverse_comm_fix(this); @@ -755,6 +741,24 @@ fprintf(stdout, "\n%6d %6d,%6d %6d: " copymode = 0; } +template +template +KOKKOS_INLINE_FUNCTION +void FixShardlowKokkos::operator()(TagFixShardlowSSAUpdateDPDE, const int &workItem) const { + const int ct = ssa_itemLen(workPhase, workItem); + const int ii = ssa_itemLoc(workPhase, workItem); + ssa_update_dpde(ii, ct, workItem); +} + +template +template +KOKKOS_INLINE_FUNCTION +void FixShardlowKokkos::operator()(TagFixShardlowSSAUpdateDPDEGhost, const int &workItem) const { + const int ct = ssa_gitemLen(workPhase, workItem); + const int ii = ssa_gitemLoc(workPhase, workItem); + ssa_update_dpde(ii, ct, workItem); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h index df8849d80b..91a2fdbc97 100644 --- a/src/KOKKOS/fix_shardlow_kokkos.h +++ b/src/KOKKOS/fix_shardlow_kokkos.h @@ -30,6 +30,12 @@ FixStyle(shardlow/kk/host,FixShardlowKokkos) namespace LAMMPS_NS { +template +struct TagFixShardlowSSAUpdateDPDE{}; + +template +struct TagFixShardlowSSAUpdateDPDEGhost{}; + template class FixShardlowKokkos : public FixShardlow { public: @@ -60,6 +66,14 @@ class FixShardlowKokkos : public FixShardlow { F_FLOAT cutinv,halfsigma,kappa,alpha; }; + template + KOKKOS_INLINE_FUNCTION + void operator()(TagFixShardlowSSAUpdateDPDE, const int&) const; + + template + KOKKOS_INLINE_FUNCTION + void operator()(TagFixShardlowSSAUpdateDPDEGhost, const int&) const; + #ifdef DEBUG_PAIR_CT typename AT::t_int_2d d_counters; typename HAT::t_int_2d h_counters; @@ -68,6 +82,7 @@ class FixShardlowKokkos : public FixShardlow { #endif protected: + int workPhase; double boltz_inv,ftm2v,dt; // class PairDPDfdt *pairDPD; @@ -127,7 +142,7 @@ class FixShardlowKokkos : public FixShardlow { // void ssa_update_dpd(int, int); // Constant Temperature template KOKKOS_INLINE_FUNCTION - void ssa_update_dpde(int, int, int); // Constant Energy + void ssa_update_dpde(int, int, int) const; // Constant Energy }; From 3c8e75ad590ae35be1002ce88281d88d8bbfc6f9 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 9 Jun 2017 10:57:35 -0600 Subject: [PATCH 7/7] Add missing sync/modify to fix_shardlow_kokkos --- src/KOKKOS/fix_shardlow_kokkos.cpp | 27 ++++++++++++++++++--------- src/KOKKOS/fix_shardlow_kokkos.h | 2 +- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp index d2fb937a57..0c7c51c821 100644 --- a/src/KOKKOS/fix_shardlow_kokkos.cpp +++ b/src/KOKKOS/fix_shardlow_kokkos.cpp @@ -73,11 +73,11 @@ FixShardlowKokkos::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0) { kokkosable = 1; -// atomKK = (AtomKokkos *) atom; -// execution_space = ExecutionSpaceFromDevice::space; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; -// datamask_read = X_MASK | V_MASK | F_MASK | MASK_MASK | Q_MASK | TYPE_MASK; -// datamask_modify = Q_MASK | X_MASK; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; if (narg != 3) error->all(FLERR,"Illegal fix shardlow command"); @@ -167,6 +167,7 @@ void FixShardlowKokkos::init() //FIXME either create cutsq and fill it in, or just point to pairDPD's... // memory->destroy(cutsq); //FIXME // memory->create_kokkos(k_cutsq,cutsq,ntypes+1,ntypes+1,"FixShardlowKokkos:cutsq"); + k_pairDPDE->k_cutsq.template sync(); d_cutsq = k_pairDPDE->k_cutsq.template view(); //FIXME const double boltz2 = 2.0*force->boltz; @@ -288,10 +289,6 @@ void FixShardlowKokkos::ssa_update_dpd( rand_type rand_gen = rand_pool.get_state(id); #endif - const double theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j - const double boltz_inv = 1.0/force->boltz; - const double ftm2v = force->ftm2v; - const double dt = update->dt; int ct = count; int ii = start_ii; @@ -677,20 +674,24 @@ void FixShardlowKokkos::initial_integrate(int vflag) deep_copy(d_hist, h_hist); #endif + //theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j boltz_inv = 1.0/force->boltz; ftm2v = force->ftm2v; dt = update->dt; + k_params.template sync(); + // process neighbors in the local AIR + atomKK->sync(execution_space,X_MASK | V_MASK | TYPE_MASK | RMASS_MASK | UCOND_MASK | UMECH_MASK | DPDTHETA_MASK); for (workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) { int workItemCt = h_ssa_phaseLen[workPhase]; - if(atom->ntypes > MAX_TYPES_STACKPARAMS) Kokkos::parallel_for(Kokkos::RangePolicy >(0,workItemCt),*this); else Kokkos::parallel_for(Kokkos::RangePolicy >(0,workItemCt),*this); } + atomKK->modified(execution_space,V_MASK | UCOND_MASK | UMECH_MASK); //Loop over all 13 outward directions (7 stages) for (workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) { @@ -698,7 +699,9 @@ void FixShardlowKokkos::initial_integrate(int vflag) int workItemCt = h_ssa_gphaseLen[workPhase]; // Communicate the updated velocities to all nodes + atomKK->sync(Host,V_MASK); comm->forward_comm_fix(this); + atomKK->modified(Host,V_MASK); if(k_pairDPDE){ // Zero out the ghosts' uCond & uMech to be used as delta accumulators @@ -706,6 +709,7 @@ void FixShardlowKokkos::initial_integrate(int vflag) // memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost); // must capture local variables, not class variables + atomKK->sync(execution_space,UCOND_MASK | UMECH_MASK); auto l_uCond = uCond; auto l_uMech = uMech; Kokkos::parallel_for(Kokkos::RangePolicy(nlocal,nlocal+nghost), LAMMPS_LAMBDA (const int i) { @@ -713,16 +717,21 @@ void FixShardlowKokkos::initial_integrate(int vflag) l_uMech(i) = 0.0; }); DeviceType::fence(); + atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK); } // process neighbors in this AIR + atomKK->sync(execution_space,X_MASK | V_MASK | TYPE_MASK | RMASS_MASK | UCOND_MASK | UMECH_MASK | DPDTHETA_MASK); if(atom->ntypes > MAX_TYPES_STACKPARAMS) Kokkos::parallel_for(Kokkos::RangePolicy >(0,workItemCt),*this); else Kokkos::parallel_for(Kokkos::RangePolicy >(0,workItemCt),*this); + atomKK->modified(execution_space,V_MASK | UCOND_MASK | UMECH_MASK); // Communicate the ghost deltas to the atom owners + atomKK->sync(Host,V_MASK | UCOND_MASK | UMECH_MASK); comm->reverse_comm_fix(this); + atomKK->modified(Host,V_MASK | UCOND_MASK | UMECH_MASK); } //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h index 91a2fdbc97..3dbbaaa61c 100644 --- a/src/KOKKOS/fix_shardlow_kokkos.h +++ b/src/KOKKOS/fix_shardlow_kokkos.h @@ -83,7 +83,7 @@ class FixShardlowKokkos : public FixShardlow { protected: int workPhase; - double boltz_inv,ftm2v,dt; + double theta_ij_inv,boltz_inv,ftm2v,dt; // class PairDPDfdt *pairDPD; PairDPDfdtEnergyKokkos *k_pairDPDE;