From 1249c713f0875d5c1757f1927d31d627240001da Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 8 Mar 2022 14:28:06 -0700 Subject: [PATCH 01/15] Add transpose to neigh list --- src/KOKKOS/kokkos_type.h | 14 ++++++++++++++ src/KOKKOS/neigh_list_kokkos.cpp | 1 + src/KOKKOS/neigh_list_kokkos.h | 7 +++++++ src/KOKKOS/npair_kokkos.cpp | 14 +++++++++----- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index 09a78b17df..22620f0d3d 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -887,6 +887,13 @@ typedef tdual_neighbors_2d::t_dev_um t_neighbors_2d_um; typedef tdual_neighbors_2d::t_dev_const_um t_neighbors_2d_const_um; typedef tdual_neighbors_2d::t_dev_const_randomread t_neighbors_2d_randomread; +typedef Kokkos::DualView tdual_neighbors_2d_lr; +typedef tdual_neighbors_2d_lr::t_dev t_neighbors_2d_lr; +typedef tdual_neighbors_2d_lr::t_dev_const t_neighbors_2d_const_lr; +typedef tdual_neighbors_2d_lr::t_dev_um t_neighbors_2d_um_lr; +typedef tdual_neighbors_2d_lr::t_dev_const_um t_neighbors_2d_const_um_lr; +typedef tdual_neighbors_2d_lr::t_dev_const_randomread t_neighbors_2d_randomread_lr; + }; #ifdef LMP_KOKKOS_GPU @@ -1156,6 +1163,13 @@ typedef tdual_neighbors_2d::t_host_um t_neighbors_2d_um; typedef tdual_neighbors_2d::t_host_const_um t_neighbors_2d_const_um; typedef tdual_neighbors_2d::t_host_const_randomread t_neighbors_2d_randomread; +typedef Kokkos::DualView tdual_neighbors_2d_lr; +typedef tdual_neighbors_2d_lr::t_host t_neighbors_2d_lr; +typedef tdual_neighbors_2d_lr::t_host_const t_neighbors_2d_const_lr; +typedef tdual_neighbors_2d_lr::t_host_um t_neighbors_2d_um_lr; +typedef tdual_neighbors_2d_lr::t_host_const_um t_neighbors_2d_const_um_lr; +typedef tdual_neighbors_2d_lr::t_host_const_randomread t_neighbors_2d_randomread_lr; + }; #endif //default LAMMPS Types diff --git a/src/KOKKOS/neigh_list_kokkos.cpp b/src/KOKKOS/neigh_list_kokkos.cpp index 153bc558fa..22a60be072 100644 --- a/src/KOKKOS/neigh_list_kokkos.cpp +++ b/src/KOKKOS/neigh_list_kokkos.cpp @@ -43,6 +43,7 @@ void NeighListKokkos::grow(int nmax) d_ilist = k_ilist.view(); d_numneigh = typename ArrayTypes::t_int_1d("neighlist:numneigh",maxatoms); d_neighbors = typename ArrayTypes::t_neighbors_2d(Kokkos::NoInit("neighlist:neighbors"),maxatoms,maxneighs); + d_neighbors_build = typename ArrayTypes::t_neighbors_2d_lr(Kokkos::NoInit("neighlist:neighbors"),maxatoms,maxneighs); } /* ---------------------------------------------------------------------- */ diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h index 04e5e7e17f..033a4fcfa9 100644 --- a/src/KOKKOS/neigh_list_kokkos.h +++ b/src/KOKKOS/neigh_list_kokkos.h @@ -70,6 +70,7 @@ public: void grow(int nmax); typename ArrayTypes::t_neighbors_2d d_neighbors; + typename ArrayTypes::t_neighbors_2d_lr d_neighbors_build; DAT::tdual_int_1d k_ilist; // local indices of I atoms typename ArrayTypes::t_int_1d d_ilist; typename ArrayTypes::t_int_1d d_numneigh; @@ -82,6 +83,12 @@ public: &d_neighbors(i,1)-&d_neighbors(i,0)); } + KOKKOS_INLINE_FUNCTION + AtomNeighbors get_neighbors_build(const int &i) const { + return AtomNeighbors(&d_neighbors_build(i,0),d_numneigh(i), + &d_neighbors_build(i,1)-&d_neighbors_build(i,0)); + } + KOKKOS_INLINE_FUNCTION static AtomNeighborsConst static_neighbors_const(int i, typename ArrayTypes::t_neighbors_2d_const const& d_neighbors, diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index 42ecedd78a..54b0cbb66a 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -321,7 +321,9 @@ void NPairKokkos::build(NeighList *list_) if (data.h_resize()) { list->maxneighs = data.h_new_maxneighs() * 1.2; list->d_neighbors = typename AT::t_neighbors_2d(Kokkos::NoInit("neighbors"), list->d_neighbors.extent(0), list->maxneighs); + list->d_neighbors_build = typename AT::t_neighbors_2d_lr(Kokkos::NoInit("neighbors"), list->d_neighbors_build.extent(0), list->maxneighs); data.neigh_list.d_neighbors = list->d_neighbors; + data.neigh_list.d_neighbors_build = list->d_neighbors_build; data.neigh_list.maxneighs = list->maxneighs; } } @@ -335,6 +337,8 @@ void NPairKokkos::build(NeighList *list_) } list->k_ilist.template modify(); + + Kokkos::deep_copy(list->d_neighbors,list->d_neighbors_build); } /* ---------------------------------------------------------------------- */ @@ -413,7 +417,7 @@ void NeighborKokkosExecute:: else moltemplate = 0; // get subview of neighbors of i - const AtomNeighbors neighbors_i = neigh_list.get_neighbors(i); + const AtomNeighbors neighbors_i = neigh_list.get_neighbors_build(i); const X_FLOAT xtmp = x(i, 0); const X_FLOAT ytmp = x(i, 1); const X_FLOAT ztmp = x(i, 2); @@ -593,7 +597,7 @@ void NeighborKokkosExecute::build_ItemGPU(typename Kokkos::TeamPolic X_FLOAT ytmp; X_FLOAT ztmp; int itype; - const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i>=0&&i=0&&i= 0) { xtmp = x(i, 0); @@ -786,7 +790,7 @@ void NeighborKokkosExecute:: else moltemplate = 0; // get subview of neighbors of i - const AtomNeighbors neighbors_i = neigh_list.get_neighbors(i); + const AtomNeighbors neighbors_i = neigh_list.get_neighbors_build(i); const X_FLOAT xtmp = x(i, 0); const X_FLOAT ytmp = x(i, 1); const X_FLOAT ztmp = x(i, 2); @@ -906,7 +910,7 @@ void NeighborKokkosExecute:: // get subview of neighbors of i - const AtomNeighbors neighbors_i = neigh_list.get_neighbors(i); + const AtomNeighbors neighbors_i = neigh_list.get_neighbors_build(i); const X_FLOAT xtmp = x(i, 0); const X_FLOAT ytmp = x(i, 1); const X_FLOAT ztmp = x(i, 2); @@ -1041,7 +1045,7 @@ void NeighborKokkosExecute::build_ItemSizeGPU(typename Kokkos::TeamP X_FLOAT ztmp; X_FLOAT radi; int itype; - const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i>=0&&i=0&&i= 0) { From d49c27e362ab328b56e2c4634d27da1ae4678e87 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 16 Mar 2022 14:12:20 -0400 Subject: [PATCH 02/15] Added a "smart" transpose routine with scratch memory staging for neighbor table transposes between LR and LL --- src/KOKKOS/npair_kokkos.cpp | 6 +- src/KOKKOS/npair_kokkos.h | 180 ++++++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index 54b0cbb66a..e1a0d63e11 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -338,7 +338,11 @@ void NPairKokkos::build(NeighList *list_) list->k_ilist.template modify(); - Kokkos::deep_copy(list->d_neighbors,list->d_neighbors_build); + //Kokkos::deep_copy(list->d_neighbors,list->d_neighbors_build); + { + // Perform an optimized transpose + NPairKokkosTransposeHelper(list->d_neighbors, list->d_neighbors_build); + } } /* ---------------------------------------------------------------------- */ diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h index e78a0f6a85..85249d7d1c 100644 --- a/src/KOKKOS/npair_kokkos.h +++ b/src/KOKKOS/npair_kokkos.h @@ -473,6 +473,186 @@ struct NPairKokkosBuildFunctorSize { void operator() (typename Kokkos::TeamPolicy::member_type /*dev*/) const {} // Should error out }; +// This helper class implements optimized out-of-place transposes for Rank-2 views. +// In the case where both views have the same layout, it uses Kokkos' default `deep_copy`. +// In the case where the views have different layouts (LayoutLeft/LayoutRight), it implements +// a transpose through scratch memory staging +template +struct NPairKokkosTransposeHelper { + + using DeviceType = DeviceType_; + + using t_view_dst = t_view_dst_; + using t_view_src = t_view_src_; + + static_assert(std::is_same::value, "Value types do not match"); + static_assert(t_view_dst::Rank == 2, "Destination view rank != 2"); + static_assert(t_view_src::Rank == 2, "Source view rank != 2"); + + using dst_layout = typename t_view_dst::traits::array_layout; + using src_layout = typename t_view_src::traits::array_layout; + + typedef ArrayTypes AT; + + using t_view_value = typename t_view_dst::value_type; + + // 32x32 tiles, will update so each thread does multiple loads + static constexpr int vector_length = 32; + static constexpr int bank_pad = 1; + static constexpr int elem_size = sizeof(t_view_value); + + static constexpr int threads_per_team = 4; + + t_view_dst d_dst; + t_view_src d_src; + + bool src_is_layout_right; + + // extents divided by vector length, rounded up + int extent_tiles[2]; + + // 1 if extent is divisible by vector length, 0 otherwise + int extent_is_multiple_vector_length[2]; + + // number of teams + int n_teams; + + // amount of shared memory per thread + int shared_mem_per_thread; + + NPairKokkosTransposeHelper(t_view_dst d_dst_, t_view_src d_src_) + : d_dst(d_dst_), d_src(d_src_) { + + assert(d_dst.extent(0) == d_src.extent(0) && d_dst.extent(1) == d_dst.extent(1)); + + if (std::is_same::value) { + Kokkos::deep_copy(d_dst, d_src); + } else { + + src_is_layout_right = std::is_same::value; + + extent_tiles[0] = (d_dst.extent(0) + vector_length - 1) / vector_length; + extent_tiles[1] = (d_dst.extent(1) + vector_length - 1) / vector_length; + + extent_is_multiple_vector_length[0] = (extent_tiles[0] * vector_length == d_dst.extent(0)) ? 1 : 0; + extent_is_multiple_vector_length[1] = (extent_tiles[1] * vector_length == d_dst.extent(1)) ? 1 : 0; + + n_teams = (extent_tiles[0] * extent_tiles[1] + threads_per_team - 1) / threads_per_team; + + shared_mem_per_thread = vector_length * (vector_length + bank_pad) * elem_size; + + Kokkos::TeamPolicy transpose_policy(n_teams, threads_per_team, vector_length); + transpose_policy = transpose_policy.set_scratch_size(0, Kokkos::PerThread(shared_mem_per_thread)); + + Kokkos::parallel_for(transpose_policy, *this); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& team_member) const { + + t_view_value* buffer = (t_view_value*)(team_member.team_shmem().get_shmem(shared_mem_per_thread * threads_per_team, 0)) + (shared_mem_per_thread / elem_size) * team_member.team_rank(); + + // extract flattened tile + const int flattened_idx = team_member.team_rank() + team_member.league_rank() * threads_per_team; + + // get range x, range y tile + int extent_tile_id[2]; + if (src_is_layout_right) { + // keep extent 1 tiles close together b/c loading from layout right + extent_tile_id[0] = flattened_idx / extent_tiles[1]; + extent_tile_id[1] = flattened_idx - extent_tile_id[0] * extent_tiles[1]; + } else { + // keep extent 0 tiles close together b/c loading from layout left + extent_tile_id[1] = flattened_idx / extent_tiles[0]; + extent_tile_id[0] = flattened_idx - extent_tile_id[1] * extent_tiles[0]; + } + + int elem[2]; + elem[0] = extent_tile_id[0] * vector_length; + elem[1] = extent_tile_id[1] * vector_length; + + if (elem[0] >= d_dst.extent(0) || + elem[1] >= d_dst.extent(1)) return; + + // determine if a row/column is a full `vector_length` in size or not + bool perfect_pad[2]; + perfect_pad[0] = (extent_is_multiple_vector_length[0] == 1 || extent_tile_id[0] + 1 < extent_tiles[0]); + perfect_pad[1] = (extent_is_multiple_vector_length[1] == 1 || extent_tile_id[1] + 1 < extent_tiles[1]); + + // load phase + if (src_is_layout_right) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + [&] (const int j) { + + if (elem[1] + j < d_src.extent(1)) { + if (perfect_pad[0]) { + for (int i = 0; i < vector_length; i++) + buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + } else { + for (int i = 0; i < (d_src.extent(0) - elem[0]); i++) + buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + } + } + }); + + } else { + // src is layout left + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + [&] (const int i) { + + if (elem[0] + i < d_src.extent(0)) { + if (perfect_pad[0]) { + for (int j = 0; j < vector_length; j++) + buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + } else { + for (int j = 0; j < (d_src.extent(1) - elem[1]); j++) + buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + } + } + }); + } + + // No need for an extra sync b/c, as confirmed by asking on the Kokkos Slack, there + // is an implicit sync at the end of a ThreadVectorRange as per the Kokkos + // programming model. + + // save phase + if (src_is_layout_right) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + [&] (const int i) { + + if (elem[0] + i < d_dst.extent(0)) { + if (perfect_pad[1]) { + for (int j = 0; j < vector_length; j++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + } else { + for (int j = 0; j < (d_dst.extent(1) - elem[1]); j++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + } + } + }); + } else { + + // src is layout left + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + [&] (const int j) { + + if (elem[1] + j < d_dst.extent(1)) { + if (perfect_pad[1]) { + for (int i = 0; i < vector_length; i++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + } else { + for (int i = 0; i < (d_dst.extent(0) - elem[0]); i++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + } + } + }); + } + + } +}; + } #endif From 30966dfed0568a69bee61da1a0fb7ce28a4e40c5 Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Fri, 18 Mar 2022 14:14:34 -0600 Subject: [PATCH 03/15] Bugfix from @weinbe2 --- src/KOKKOS/npair_kokkos.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h index 85249d7d1c..bd8635ce35 100644 --- a/src/KOKKOS/npair_kokkos.h +++ b/src/KOKKOS/npair_kokkos.h @@ -602,7 +602,7 @@ struct NPairKokkosTransposeHelper { [&] (const int i) { if (elem[0] + i < d_src.extent(0)) { - if (perfect_pad[0]) { + if (perfect_pad[1]) { for (int j = 0; j < vector_length; j++) buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); } else { @@ -639,7 +639,7 @@ struct NPairKokkosTransposeHelper { [&] (const int j) { if (elem[1] + j < d_dst.extent(1)) { - if (perfect_pad[1]) { + if (perfect_pad[0]) { for (int i = 0; i < vector_length; i++) d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; } else { From 6dae6198a242caf51886928a12399ebd75984a12 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 25 Mar 2022 09:17:39 -0700 Subject: [PATCH 04/15] Add neigh transpose as Kokkos package option --- src/KOKKOS/Install.sh | 1 + src/KOKKOS/kokkos.cpp | 5 + src/KOKKOS/kokkos.h | 1 + src/KOKKOS/neigh_list_kokkos.cpp | 7 +- src/KOKKOS/neigh_list_kokkos.h | 8 +- src/KOKKOS/npair_kokkos.cpp | 38 +++-- src/KOKKOS/npair_kokkos.h | 186 +----------------------- src/KOKKOS/transpose_helper_kokkos.h | 202 +++++++++++++++++++++++++++ 8 files changed, 247 insertions(+), 201 deletions(-) create mode 100644 src/KOKKOS/transpose_helper_kokkos.h diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index 6da318c2d5..f56ab45914 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -316,6 +316,7 @@ action sna_kokkos.h sna.h action sna_kokkos_impl.h sna.cpp action third_order_kokkos.cpp dynamical_matrix.cpp action third_order_kokkos.h dynamical_matrix.h +action transpose_helper_kokkos.h action verlet_kokkos.cpp action verlet_kokkos.h diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index dc8a1173f3..382808d523 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -232,6 +232,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) #endif neigh_thread = 0; neigh_thread_set = 0; + neigh_transpose = 0; if (ngpus > 0) { neighflag = FULL; neighflag_qeq = FULL; @@ -481,6 +482,10 @@ void KokkosLMP::accelerator(int narg, char **arg) neigh_thread = utils::logical(FLERR,arg[iarg+1],false,lmp); neigh_thread_set = 1; iarg += 2; + } else if (strcmp(arg[iarg],"neigh/transpose") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + neigh_transpose = utils::logical(FLERR,arg[iarg+1],false,lmp); + iarg += 2; } else error->all(FLERR,"Illegal package kokkos command"); } diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index be4990d0a4..669a773734 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -46,6 +46,7 @@ class KokkosLMP : protected Pointers { int gpu_aware_flag; int neigh_thread; int neigh_thread_set; + int neigh_transpose; int newtonflag; double binsize; diff --git a/src/KOKKOS/neigh_list_kokkos.cpp b/src/KOKKOS/neigh_list_kokkos.cpp index 76e8ef0c19..2337b1ef3d 100644 --- a/src/KOKKOS/neigh_list_kokkos.cpp +++ b/src/KOKKOS/neigh_list_kokkos.cpp @@ -13,6 +13,7 @@ ------------------------------------------------------------------------- */ #include "neigh_list_kokkos.h" +#include "kokkos.h" using namespace LAMMPS_NS; @@ -44,7 +45,11 @@ void NeighListKokkos::grow(int nmax) d_numneigh = typename ArrayTypes::t_int_1d("neighlist:numneigh",maxatoms); d_neighbors = typename ArrayTypes::t_neighbors_2d(); d_neighbors = typename ArrayTypes::t_neighbors_2d(Kokkos::NoInit("neighlist:neighbors"),maxatoms,maxneighs); - d_neighbors_build = typename ArrayTypes::t_neighbors_2d_lr(Kokkos::NoInit("neighlist:neighbors"),maxatoms,maxneighs); + + if (lmp->kokkos->neigh_transpose) { + d_neighbors_transpose = typename ArrayTypes::t_neighbors_2d_lr(); + d_neighbors_transpose = typename ArrayTypes::t_neighbors_2d_lr(Kokkos::NoInit("neighlist:neighbors"),maxatoms,maxneighs); + } } /* ---------------------------------------------------------------------- */ diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h index 033a4fcfa9..d06a3e76cf 100644 --- a/src/KOKKOS/neigh_list_kokkos.h +++ b/src/KOKKOS/neigh_list_kokkos.h @@ -70,7 +70,7 @@ public: void grow(int nmax); typename ArrayTypes::t_neighbors_2d d_neighbors; - typename ArrayTypes::t_neighbors_2d_lr d_neighbors_build; + typename ArrayTypes::t_neighbors_2d_lr d_neighbors_transpose; DAT::tdual_int_1d k_ilist; // local indices of I atoms typename ArrayTypes::t_int_1d d_ilist; typename ArrayTypes::t_int_1d d_numneigh; @@ -84,9 +84,9 @@ public: } KOKKOS_INLINE_FUNCTION - AtomNeighbors get_neighbors_build(const int &i) const { - return AtomNeighbors(&d_neighbors_build(i,0),d_numneigh(i), - &d_neighbors_build(i,1)-&d_neighbors_build(i,0)); + AtomNeighbors get_neighbors_transpose(const int &i) const { + return AtomNeighbors(&d_neighbors_transpose(i,0),d_numneigh(i), + &d_neighbors_transpose(i,1)-&d_neighbors_transpose(i,0)); } KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index 723bddb6de..3624976a11 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -21,6 +21,8 @@ #include "nbin_kokkos.h" #include "nstencil.h" #include "force.h" +#include "kokkos.h" +#include "transpose_helper_kokkos.h" namespace LAMMPS_NS { @@ -158,7 +160,7 @@ void NPairKokkos::build(NeighList *list_) mbins,nstencil, k_stencil.view(), k_stencilxyz.view(), - nlocal, + nlocal,nall,lmp->kokkos->neigh_transpose, atomKK->k_x.view(), atomKK->k_radius.view(), atomKK->k_type.view(), @@ -324,10 +326,15 @@ void NPairKokkos::build(NeighList *list_) data.neigh_list.d_neighbors = typename AT::t_neighbors_2d(); list->d_neighbors = typename AT::t_neighbors_2d(); list->d_neighbors = typename AT::t_neighbors_2d(Kokkos::NoInit("neighlist:neighbors"), maxatoms, list->maxneighs); - list->d_neighbors_build = typename AT::t_neighbors_2d_lr(Kokkos::NoInit("neighbors"), list->d_neighbors_build.extent(0), list->maxneighs); data.neigh_list.d_neighbors = list->d_neighbors; - data.neigh_list.d_neighbors_build = list->d_neighbors_build; data.neigh_list.maxneighs = list->maxneighs; + + if (lmp->kokkos->neigh_transpose) { + data.neigh_list.d_neighbors_transpose = typename AT::t_neighbors_2d_lr(); + list->d_neighbors_transpose = typename AT::t_neighbors_2d_lr(); + list->d_neighbors_transpose = typename AT::t_neighbors_2d_lr(Kokkos::NoInit("neighlist:neighbors"), maxatoms, list->maxneighs); + data.neigh_list.d_neighbors_transpose = list->d_neighbors_transpose; + } } } @@ -341,11 +348,9 @@ void NPairKokkos::build(NeighList *list_) list->k_ilist.template modify(); - //Kokkos::deep_copy(list->d_neighbors,list->d_neighbors_build); - { - // Perform an optimized transpose - NPairKokkosTransposeHelper(list->d_neighbors, list->d_neighbors_build); - } + if (lmp->kokkos->neigh_transpose) + TransposeHelperKokkos(list->d_neighbors, list->d_neighbors_transpose); } /* ---------------------------------------------------------------------- */ @@ -424,7 +429,8 @@ void NeighborKokkosExecute:: else moltemplate = 0; // get subview of neighbors of i - const AtomNeighbors neighbors_i = neigh_list.get_neighbors_build(i); + const AtomNeighbors neighbors_i = neigh_transpose ? + neigh_list.get_neighbors_transpose(i) : neigh_list.get_neighbors(i); const X_FLOAT xtmp = x(i, 0); const X_FLOAT ytmp = x(i, 1); const X_FLOAT ztmp = x(i, 2); @@ -604,7 +610,9 @@ void NeighborKokkosExecute::build_ItemGPU(typename Kokkos::TeamPolic X_FLOAT ytmp; X_FLOAT ztmp; int itype; - const AtomNeighbors neighbors_i = neigh_list.get_neighbors_build((i>=0&&i= 0 && i < nlocal) ? i : 0; + const AtomNeighbors neighbors_i = neigh_transpose ? + neigh_list.get_neighbors_transpose(index) : neigh_list.get_neighbors(index); if (i >= 0) { xtmp = x(i, 0); @@ -797,7 +805,8 @@ void NeighborKokkosExecute:: else moltemplate = 0; // get subview of neighbors of i - const AtomNeighbors neighbors_i = neigh_list.get_neighbors_build(i); + const AtomNeighbors neighbors_i = neigh_transpose ? + neigh_list.get_neighbors_transpose(i) : neigh_list.get_neighbors(i); const X_FLOAT xtmp = x(i, 0); const X_FLOAT ytmp = x(i, 1); const X_FLOAT ztmp = x(i, 2); @@ -917,7 +926,8 @@ void NeighborKokkosExecute:: // get subview of neighbors of i - const AtomNeighbors neighbors_i = neigh_list.get_neighbors_build(i); + const AtomNeighbors neighbors_i = neigh_transpose ? + neigh_list.get_neighbors_transpose(i) : neigh_list.get_neighbors(i); const X_FLOAT xtmp = x(i, 0); const X_FLOAT ytmp = x(i, 1); const X_FLOAT ztmp = x(i, 2); @@ -1052,7 +1062,9 @@ void NeighborKokkosExecute::build_ItemSizeGPU(typename Kokkos::TeamP X_FLOAT ztmp; X_FLOAT radi; int itype; - const AtomNeighbors neighbors_i = neigh_list.get_neighbors_build((i>=0&&i= 0 && i < nlocal) ? i : 0; + const AtomNeighbors neighbors_i = neigh_transpose ? + neigh_list.get_neighbors_transpose(index) : neigh_list.get_neighbors(index); const int mask_history = 3 << SBBITS; if (i >= 0) { diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h index bd8635ce35..fb524bc1ad 100644 --- a/src/KOKKOS/npair_kokkos.h +++ b/src/KOKKOS/npair_kokkos.h @@ -208,7 +208,7 @@ class NeighborKokkosExecute const X_FLOAT bininvx,bininvy,bininvz; X_FLOAT bboxhi[3],bboxlo[3]; - const int nlocal; + const int nlocal,nall,neigh_transpose; typename AT::t_int_scalar resize; typename AT::t_int_scalar new_maxneighs; @@ -230,7 +230,7 @@ class NeighborKokkosExecute const int _mbins,const int _nstencil, const typename AT::t_int_1d &_d_stencil, const typename AT::t_int_1d_3 &_d_stencilxyz, - const int _nlocal, + const int _nlocal,const int _nall,const int _neigh_transpose, const typename AT::t_x_array_randomread &_x, const typename AT::t_float_1d &_radius, const typename AT::t_int_1d_const &_type, @@ -280,7 +280,7 @@ class NeighborKokkosExecute mbinx(_mbinx),mbiny(_mbiny),mbinz(_mbinz), mbinxlo(_mbinxlo),mbinylo(_mbinylo),mbinzlo(_mbinzlo), bininvx(_bininvx),bininvy(_bininvy),bininvz(_bininvz), - nlocal(_nlocal), + nlocal(_nlocal),nall(_nall),neigh_transpose(_neigh_transpose), xperiodic(_xperiodic),yperiodic(_yperiodic),zperiodic(_zperiodic), xprd_half(_xprd_half),yprd_half(_yprd_half),zprd_half(_zprd_half), skin(_skin),resize(_resize),h_resize(_h_resize), @@ -473,186 +473,6 @@ struct NPairKokkosBuildFunctorSize { void operator() (typename Kokkos::TeamPolicy::member_type /*dev*/) const {} // Should error out }; -// This helper class implements optimized out-of-place transposes for Rank-2 views. -// In the case where both views have the same layout, it uses Kokkos' default `deep_copy`. -// In the case where the views have different layouts (LayoutLeft/LayoutRight), it implements -// a transpose through scratch memory staging -template -struct NPairKokkosTransposeHelper { - - using DeviceType = DeviceType_; - - using t_view_dst = t_view_dst_; - using t_view_src = t_view_src_; - - static_assert(std::is_same::value, "Value types do not match"); - static_assert(t_view_dst::Rank == 2, "Destination view rank != 2"); - static_assert(t_view_src::Rank == 2, "Source view rank != 2"); - - using dst_layout = typename t_view_dst::traits::array_layout; - using src_layout = typename t_view_src::traits::array_layout; - - typedef ArrayTypes AT; - - using t_view_value = typename t_view_dst::value_type; - - // 32x32 tiles, will update so each thread does multiple loads - static constexpr int vector_length = 32; - static constexpr int bank_pad = 1; - static constexpr int elem_size = sizeof(t_view_value); - - static constexpr int threads_per_team = 4; - - t_view_dst d_dst; - t_view_src d_src; - - bool src_is_layout_right; - - // extents divided by vector length, rounded up - int extent_tiles[2]; - - // 1 if extent is divisible by vector length, 0 otherwise - int extent_is_multiple_vector_length[2]; - - // number of teams - int n_teams; - - // amount of shared memory per thread - int shared_mem_per_thread; - - NPairKokkosTransposeHelper(t_view_dst d_dst_, t_view_src d_src_) - : d_dst(d_dst_), d_src(d_src_) { - - assert(d_dst.extent(0) == d_src.extent(0) && d_dst.extent(1) == d_dst.extent(1)); - - if (std::is_same::value) { - Kokkos::deep_copy(d_dst, d_src); - } else { - - src_is_layout_right = std::is_same::value; - - extent_tiles[0] = (d_dst.extent(0) + vector_length - 1) / vector_length; - extent_tiles[1] = (d_dst.extent(1) + vector_length - 1) / vector_length; - - extent_is_multiple_vector_length[0] = (extent_tiles[0] * vector_length == d_dst.extent(0)) ? 1 : 0; - extent_is_multiple_vector_length[1] = (extent_tiles[1] * vector_length == d_dst.extent(1)) ? 1 : 0; - - n_teams = (extent_tiles[0] * extent_tiles[1] + threads_per_team - 1) / threads_per_team; - - shared_mem_per_thread = vector_length * (vector_length + bank_pad) * elem_size; - - Kokkos::TeamPolicy transpose_policy(n_teams, threads_per_team, vector_length); - transpose_policy = transpose_policy.set_scratch_size(0, Kokkos::PerThread(shared_mem_per_thread)); - - Kokkos::parallel_for(transpose_policy, *this); - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const typename Kokkos::TeamPolicy::member_type& team_member) const { - - t_view_value* buffer = (t_view_value*)(team_member.team_shmem().get_shmem(shared_mem_per_thread * threads_per_team, 0)) + (shared_mem_per_thread / elem_size) * team_member.team_rank(); - - // extract flattened tile - const int flattened_idx = team_member.team_rank() + team_member.league_rank() * threads_per_team; - - // get range x, range y tile - int extent_tile_id[2]; - if (src_is_layout_right) { - // keep extent 1 tiles close together b/c loading from layout right - extent_tile_id[0] = flattened_idx / extent_tiles[1]; - extent_tile_id[1] = flattened_idx - extent_tile_id[0] * extent_tiles[1]; - } else { - // keep extent 0 tiles close together b/c loading from layout left - extent_tile_id[1] = flattened_idx / extent_tiles[0]; - extent_tile_id[0] = flattened_idx - extent_tile_id[1] * extent_tiles[0]; - } - - int elem[2]; - elem[0] = extent_tile_id[0] * vector_length; - elem[1] = extent_tile_id[1] * vector_length; - - if (elem[0] >= d_dst.extent(0) || - elem[1] >= d_dst.extent(1)) return; - - // determine if a row/column is a full `vector_length` in size or not - bool perfect_pad[2]; - perfect_pad[0] = (extent_is_multiple_vector_length[0] == 1 || extent_tile_id[0] + 1 < extent_tiles[0]); - perfect_pad[1] = (extent_is_multiple_vector_length[1] == 1 || extent_tile_id[1] + 1 < extent_tiles[1]); - - // load phase - if (src_is_layout_right) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), - [&] (const int j) { - - if (elem[1] + j < d_src.extent(1)) { - if (perfect_pad[0]) { - for (int i = 0; i < vector_length; i++) - buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); - } else { - for (int i = 0; i < (d_src.extent(0) - elem[0]); i++) - buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); - } - } - }); - - } else { - // src is layout left - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), - [&] (const int i) { - - if (elem[0] + i < d_src.extent(0)) { - if (perfect_pad[1]) { - for (int j = 0; j < vector_length; j++) - buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); - } else { - for (int j = 0; j < (d_src.extent(1) - elem[1]); j++) - buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); - } - } - }); - } - - // No need for an extra sync b/c, as confirmed by asking on the Kokkos Slack, there - // is an implicit sync at the end of a ThreadVectorRange as per the Kokkos - // programming model. - - // save phase - if (src_is_layout_right) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), - [&] (const int i) { - - if (elem[0] + i < d_dst.extent(0)) { - if (perfect_pad[1]) { - for (int j = 0; j < vector_length; j++) - d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; - } else { - for (int j = 0; j < (d_dst.extent(1) - elem[1]); j++) - d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; - } - } - }); - } else { - - // src is layout left - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), - [&] (const int j) { - - if (elem[1] + j < d_dst.extent(1)) { - if (perfect_pad[0]) { - for (int i = 0; i < vector_length; i++) - d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; - } else { - for (int i = 0; i < (d_dst.extent(0) - elem[0]); i++) - d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; - } - } - }); - } - - } -}; - } #endif diff --git a/src/KOKKOS/transpose_helper_kokkos.h b/src/KOKKOS/transpose_helper_kokkos.h new file mode 100644 index 0000000000..25bbe2402b --- /dev/null +++ b/src/KOKKOS/transpose_helper_kokkos.h @@ -0,0 +1,202 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Evan Weinberg (NVIDIA) +------------------------------------------------------------------------- */ + +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +// This helper class implements optimized out-of-place transposes for Rank-2 views. +// In the case where both views have the same layout, it uses Kokkos' default `deep_copy`. +// In the case where the views have different layouts (LayoutLeft/LayoutRight), it implements +// a transpose through scratch memory staging +template +struct TransposeHelperKokkos { + + using DeviceType = DeviceType_; + + using t_view_dst = t_view_dst_; + using t_view_src = t_view_src_; + + static_assert(std::is_same::value, "Value types do not match"); + static_assert(t_view_dst::Rank == 2, "Destination view rank != 2"); + static_assert(t_view_src::Rank == 2, "Source view rank != 2"); + + using dst_layout = typename t_view_dst::traits::array_layout; + using src_layout = typename t_view_src::traits::array_layout; + + typedef ArrayTypes AT; + + using t_view_value = typename t_view_dst::value_type; + + // 32x32 tiles, will update so each thread does multiple loads + static constexpr int vector_length = 32; + static constexpr int bank_pad = 1; + static constexpr int elem_size = sizeof(t_view_value); + + static constexpr int threads_per_team = 4; + + t_view_dst d_dst; + t_view_src d_src; + + bool src_is_layout_right; + + // extents divided by vector length, rounded up + int extent_tiles[2]; + + // 1 if extent is divisible by vector length, 0 otherwise + int extent_is_multiple_vector_length[2]; + + // number of teams + int n_teams; + + // amount of shared memory per thread + int shared_mem_per_thread; + + TransposeHelperKokkos(t_view_dst d_dst_, t_view_src d_src_) + : d_dst(d_dst_), d_src(d_src_) { + + assert(d_dst.extent(0) == d_src.extent(0) && d_dst.extent(1) == d_dst.extent(1)); + + if (std::is_same::value) { + Kokkos::deep_copy(d_dst, d_src); + } else { + + src_is_layout_right = std::is_same::value; + + extent_tiles[0] = (d_dst.extent(0) + vector_length - 1) / vector_length; + extent_tiles[1] = (d_dst.extent(1) + vector_length - 1) / vector_length; + + extent_is_multiple_vector_length[0] = (extent_tiles[0] * vector_length == d_dst.extent(0)) ? 1 : 0; + extent_is_multiple_vector_length[1] = (extent_tiles[1] * vector_length == d_dst.extent(1)) ? 1 : 0; + + n_teams = (extent_tiles[0] * extent_tiles[1] + threads_per_team - 1) / threads_per_team; + + shared_mem_per_thread = vector_length * (vector_length + bank_pad) * elem_size; + + Kokkos::TeamPolicy transpose_policy(n_teams, threads_per_team, vector_length); + transpose_policy = transpose_policy.set_scratch_size(0, Kokkos::PerThread(shared_mem_per_thread)); + + Kokkos::parallel_for(transpose_policy, *this); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& team_member) const { + + t_view_value* buffer = (t_view_value*)(team_member.team_shmem().get_shmem(shared_mem_per_thread * threads_per_team, 0)) + (shared_mem_per_thread / elem_size) * team_member.team_rank(); + + // extract flattened tile + const int flattened_idx = team_member.team_rank() + team_member.league_rank() * threads_per_team; + + // get range x, range y tile + int extent_tile_id[2]; + if (src_is_layout_right) { + // keep extent 1 tiles close together b/c loading from layout right + extent_tile_id[0] = flattened_idx / extent_tiles[1]; + extent_tile_id[1] = flattened_idx - extent_tile_id[0] * extent_tiles[1]; + } else { + // keep extent 0 tiles close together b/c loading from layout left + extent_tile_id[1] = flattened_idx / extent_tiles[0]; + extent_tile_id[0] = flattened_idx - extent_tile_id[1] * extent_tiles[0]; + } + + int elem[2]; + elem[0] = extent_tile_id[0] * vector_length; + elem[1] = extent_tile_id[1] * vector_length; + + if (elem[0] >= d_dst.extent(0) || + elem[1] >= d_dst.extent(1)) return; + + // determine if a row/column is a full `vector_length` in size or not + bool perfect_pad[2]; + perfect_pad[0] = (extent_is_multiple_vector_length[0] == 1 || extent_tile_id[0] + 1 < extent_tiles[0]); + perfect_pad[1] = (extent_is_multiple_vector_length[1] == 1 || extent_tile_id[1] + 1 < extent_tiles[1]); + + // load phase + if (src_is_layout_right) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + [&] (const int j) { + + if (elem[1] + j < d_src.extent(1)) { + if (perfect_pad[0]) { + for (int i = 0; i < vector_length; i++) + buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + } else { + for (int i = 0; i < (d_src.extent(0) - elem[0]); i++) + buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + } + } + }); + + } else { + // src is layout left + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + [&] (const int i) { + + if (elem[0] + i < d_src.extent(0)) { + if (perfect_pad[1]) { + for (int j = 0; j < vector_length; j++) + buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + } else { + for (int j = 0; j < (d_src.extent(1) - elem[1]); j++) + buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + } + } + }); + } + + // No need for an extra sync b/c, as confirmed by asking on the Kokkos Slack, there + // is an implicit sync at the end of a ThreadVectorRange as per the Kokkos + // programming model. + + // save phase + if (src_is_layout_right) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + [&] (const int i) { + + if (elem[0] + i < d_dst.extent(0)) { + if (perfect_pad[1]) { + for (int j = 0; j < vector_length; j++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + } else { + for (int j = 0; j < (d_dst.extent(1) - elem[1]); j++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + } + } + }); + } else { + + // src is layout left + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + [&] (const int j) { + + if (elem[1] + j < d_dst.extent(1)) { + if (perfect_pad[0]) { + for (int i = 0; i < vector_length; i++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + } else { + for (int i = 0; i < (d_dst.extent(0) - elem[0]); i++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + } + } + }); + } + + } +}; + +} From c3f4dac7dc4b1a2010281ad5597909e344524208 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 25 Mar 2022 09:45:05 -0700 Subject: [PATCH 05/15] Small tweak to comments --- src/KOKKOS/transpose_helper_kokkos.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/KOKKOS/transpose_helper_kokkos.h b/src/KOKKOS/transpose_helper_kokkos.h index 25bbe2402b..fd7088eac7 100644 --- a/src/KOKKOS/transpose_helper_kokkos.h +++ b/src/KOKKOS/transpose_helper_kokkos.h @@ -12,7 +12,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Evan Weinberg (NVIDIA) + Contributing author: Evan Weinberg (NVIDIA) ------------------------------------------------------------------------- */ #include "kokkos_type.h" @@ -159,9 +159,8 @@ struct TransposeHelperKokkos { }); } - // No need for an extra sync b/c, as confirmed by asking on the Kokkos Slack, there - // is an implicit sync at the end of a ThreadVectorRange as per the Kokkos - // programming model. + // No need for an extra sync b/c there is an implicit sync at the end + // of a ThreadVectorRange as per the Kokkos programming model // save phase if (src_is_layout_right) { From 17b35878ea010deed753cc3d01dd89527dc83a64 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 25 Mar 2022 12:03:20 -0600 Subject: [PATCH 06/15] Add version of Kokkos Ghost neigh list optimized for GPUs --- src/KOKKOS/npair_kokkos.cpp | 562 +++++++++++++++++++++++++----------- src/KOKKOS/npair_kokkos.h | 28 +- 2 files changed, 410 insertions(+), 180 deletions(-) diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index 1825fd3abe..267b7fbf63 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -158,7 +158,7 @@ void NPairKokkos::build(NeighList *list_) mbins,nstencil, k_stencil.view(), k_stencilxyz.view(), - nlocal, + nlocal,nall, atomKK->k_x.view(), atomKK->k_radius.view(), atomKK->k_type.view(), @@ -229,18 +229,33 @@ void NPairKokkos::build(NeighList *list_) #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) #define BINS_PER_BLOCK 2 - const int factor = atoms_per_bin<64?2:1; + const int factor = atoms_per_bin <64?2:1; #else const int factor = 1; #endif if (GHOST) { - NPairKokkosBuildFunctorGhost f(data); + NPairKokkosBuildFunctorGhost f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); +#ifdef LMP_KOKKOS_GPU + if (ExecutionSpaceFromDevice::space == Device) { + int team_size = atoms_per_bin*factor; + int team_size_max = Kokkos::TeamPolicy(team_size,Kokkos::AUTO).team_size_max(f,Kokkos::ParallelForTag()); + if (team_size <= team_size_max) { + Kokkos::TeamPolicy config((mbins+factor-1)/factor,team_size); + Kokkos::parallel_for(config, f); + } else { // fall back to flat method + f.sharedsize = 0; + Kokkos::parallel_for(nall, f); + } + } else + Kokkos::parallel_for(nall, f); +#else Kokkos::parallel_for(nall, f); +#endif } else { if (newton_pair) { if (SIZE) { - NPairKokkosBuildFunctorSize f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); + NPairKokkosBuildFunctorSize f(data,atoms_per_bin * 6 * sizeof(X_FLOAT) * factor); #ifdef LMP_KOKKOS_GPU if (ExecutionSpaceFromDevice::space == Device) { int team_size = atoms_per_bin*factor; @@ -444,7 +459,7 @@ void NeighborKokkosExecute:: const X_FLOAT delx = xtmp - x(j, 0); const X_FLOAT dely = ytmp - x(j, 1); const X_FLOAT delz = ztmp - x(j, 2); - const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; if (rsq <= cutneighsq(itype,jtype)) { if (molecular != Atom::ATOMIC) { if (!moltemplate) @@ -455,18 +470,18 @@ void NeighborKokkosExecute:: /* tag[j]-tagprev); */ /* else which = 0; */ if (which == 0) { - if (n 0) { - if (n:: const X_FLOAT delx = xtmp - x(j, 0); const X_FLOAT dely = ytmp - x(j, 1); const X_FLOAT delz = ztmp - x(j, 2); - const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; if (rsq <= cutneighsq(itype,jtype)) { if (molecular != Atom::ATOMIC) { @@ -512,18 +527,18 @@ void NeighborKokkosExecute:: /* tag[j]-tagprev); */ /* else which = 0; */ if (which == 0) { - if (n 0) { - if (n::build_ItemGPU(typename Kokkos::TeamPolic /* loop over atoms in i's bin, */ const int atoms_per_bin = c_bins.extent(1); - const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin<1?1:dev.team_size()/atoms_per_bin; + const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin <1?1:dev.team_size()/atoms_per_bin; const int TEAMS_PER_BIN = atoms_per_bin/dev.team_size()<1?1:atoms_per_bin/dev.team_size(); const int MY_BIN = dev.team_rank()/atoms_per_bin; const int ibin = dev.league_rank()*BINS_PER_TEAM+MY_BIN; if (ibin >= mbins) return; - X_FLOAT* other_x = sharedmem; - other_x = other_x + 5*atoms_per_bin*MY_BIN; + X_FLOAT* other_x = sharedmem + 5*atoms_per_bin*MY_BIN; int* other_id = (int*) &other_x[4 * atoms_per_bin]; int bincount_current = c_bincount[ibin]; for (int kk = 0; kk < TEAMS_PER_BIN; kk++) { const int MY_II = dev.team_rank()%atoms_per_bin+kk*dev.team_size(); - const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1; - /* if necessary, goto next page and add pages */ + const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1; + /* if necessary, goto next page and add pages */ - int n = 0; + int n = 0; - X_FLOAT xtmp; - X_FLOAT ytmp; - X_FLOAT ztmp; - int itype; - const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i>=0&&i= 0 && i < nlocal)?i:0); - if (i >= 0) { - xtmp = x(i, 0); - ytmp = x(i, 1); - ztmp = x(i, 2); - itype = type(i); - other_x[MY_II] = xtmp; - other_x[MY_II + atoms_per_bin] = ytmp; - other_x[MY_II + 2 * atoms_per_bin] = ztmp; - other_x[MY_II + 3 * atoms_per_bin] = itype; - } - other_id[MY_II] = i; + if (i >= 0) { + xtmp = x(i, 0); + ytmp = x(i, 1); + ztmp = x(i, 2); + itype = type(i); + other_x[MY_II] = xtmp; + other_x[MY_II + atoms_per_bin] = ytmp; + other_x[MY_II + 2 * atoms_per_bin] = ztmp; + other_x[MY_II + 3 * atoms_per_bin] = itype; + } + other_id[MY_II] = i; #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - int test = (__syncthreads_count(i >= 0 && i <= nlocal) == 0); - if (test) return; + int test = (__syncthreads_count(i >= 0 && i < nlocal) == 0); + if (test) return; #elif defined(KOKKOS_ENABLE_SYCL) - int not_done = (i >= 0 && i <= nlocal); - dev.team_reduce(Kokkos::Max(not_done)); - if(not_done == 0) return; + int not_done = (i >= 0 && i < nlocal); + dev.team_reduce(Kokkos::Max(not_done)); + if(not_done == 0) return; #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - dev.team_barrier(); + dev.team_barrier(); #endif - if (i >= 0 && i < nlocal) { - #pragma unroll 4 - for (int m = 0; m < bincount_current; m++) { - int j = other_id[m]; - const int jtype = other_x[m + 3 * atoms_per_bin]; - - //for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using halfneighborlists - if ((j == i) || - (HalfNeigh && !Newton && (j < i)) || - (HalfNeigh && Newton && - ((j < i) || - ((j >= nlocal) && ((x(j, 2) < ztmp) || (x(j, 2) == ztmp && x(j, 1) < ytmp) || - (x(j, 2) == ztmp && x(j, 1) == ytmp && x(j, 0) < xtmp))))) - ) continue; - if (Tri) { - if (x(j,2) < ztmp) continue; - if (x(j,2) == ztmp) { - if (x(j,1) < ytmp) continue; - if (x(j,1) == ytmp) { - if (x(j,0) < xtmp) continue; - if (x(j,0) == xtmp && j <= i) continue; - } - } - } - if (exclude && exclusion(i,j,itype,jtype)) continue; - const X_FLOAT delx = xtmp - other_x[m]; - const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; - const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; - const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; - - if (rsq <= cutneighsq(itype,jtype)) { - if (molecular != Atom::ATOMIC) { - int which = 0; - if (!moltemplate) - which = NeighborKokkosExecute::find_special(i,j); - /* else if (imol >= 0) */ - /* which = find_special(onemols[imol]->special[iatom], */ - /* onemols[imol]->nspecial[iatom], */ - /* tag[j]-tagprev); */ - /* else which = 0; */ - if (which == 0) { - if (n 0) { - if (n::t_int_1d_const_um stencil - = d_stencil; - for (int k = 0; k < nstencil; k++) { - const int jbin = ibin + stencil[k]; - - if (ibin == jbin) continue; - - bincount_current = c_bincount[jbin]; - int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1; - - if (j >= 0) { - other_x[MY_II] = x(j, 0); - other_x[MY_II + atoms_per_bin] = x(j, 1); - other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); - other_x[MY_II + 3 * atoms_per_bin] = type(j); - } - - other_id[MY_II] = j; - - dev.team_barrier(); - if (i >= 0 && i < nlocal) { - #pragma unroll 8 + #pragma unroll 4 for (int m = 0; m < bincount_current; m++) { - const int j = other_id[m]; + int j = other_id[m]; const int jtype = other_x[m + 3 * atoms_per_bin]; - //if(HalfNeigh && (j < i)) continue; - if (HalfNeigh && !Newton && (j < i)) continue; - if (!HalfNeigh && j==i) continue; - if (Tri) { - if (x(j,2) < ztmp) continue; - if (x(j,2) == ztmp) { - if (x(j,1) < ytmp) continue; - if (x(j,1) == ytmp) { - if (x(j,0) < xtmp) continue; - if (x(j,0) == xtmp && j <= i) continue; + //for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using halfneighborlists + if ((j == i) || + (HalfNeigh && !Newton && (j < i)) || + (HalfNeigh && Newton && + ((j < i) || + ((j >= nlocal) && ((x(j, 2) < ztmp) || (x(j, 2) == ztmp && x(j, 1) < ytmp) || + (x(j, 2) == ztmp && x(j, 1) == ytmp && x(j, 0) < xtmp))))) + ) continue; + if (Tri) { + if (x(j,2) < ztmp) continue; + if (x(j,2) == ztmp) { + if (x(j,1) < ytmp) continue; + if (x(j,1) == ytmp) { + if (x(j,0) < xtmp) continue; + if (x(j,0) == xtmp && j <= i) continue; + } } } - } if (exclude && exclusion(i,j,itype,jtype)) continue; - const X_FLOAT delx = xtmp - other_x[m]; const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; - const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; if (rsq <= cutneighsq(itype,jtype)) { if (molecular != Atom::ATOMIC) { @@ -739,18 +675,18 @@ void NeighborKokkosExecute::build_ItemGPU(typename Kokkos::TeamPolic /* tag[j]-tagprev); */ /* else which = 0; */ if (which == 0) { - if (n 0) { - if (n::build_ItemGPU(typename Kokkos::TeamPolic } } dev.team_barrier(); - } - if (i >= 0 && i < nlocal) { - neigh_list.d_numneigh(i) = n; - neigh_list.d_ilist(i) = i; - } + const typename ArrayTypes::t_int_1d_const_um stencil + = d_stencil; + for (int k = 0; k < nstencil; k++) { + const int jbin = ibin + stencil[k]; - if (n > neigh_list.maxneighs) { - resize() = 1; + if (ibin == jbin) continue; - if (n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop - } + bincount_current = c_bincount[jbin]; + int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1; + + if (j >= 0) { + other_x[MY_II] = x(j, 0); + other_x[MY_II + atoms_per_bin] = x(j, 1); + other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); + other_x[MY_II + 3 * atoms_per_bin] = type(j); + } + + other_id[MY_II] = j; + + dev.team_barrier(); + + if (i >= 0 && i < nlocal) { + #pragma unroll 8 + for (int m = 0; m < bincount_current; m++) { + const int j = other_id[m]; + const int jtype = other_x[m + 3 * atoms_per_bin]; + + //if(HalfNeigh && (j < i)) continue; + if (HalfNeigh && !Newton && (j < i)) continue; + if (!HalfNeigh && j==i) continue; + if (Tri) { + if (x(j,2) < ztmp) continue; + if (x(j,2) == ztmp) { + if (x(j,1) < ytmp) continue; + if (x(j,1) == ytmp) { + if (x(j,0) < xtmp) continue; + if (x(j,0) == xtmp && j <= i) continue; + } + } + } + if (exclude && exclusion(i,j,itype,jtype)) continue; + + const X_FLOAT delx = xtmp - other_x[m]; + const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; + const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if (rsq <= cutneighsq(itype,jtype)) { + if (molecular != Atom::ATOMIC) { + int which = 0; + if (!moltemplate) + which = NeighborKokkosExecute::find_special(i,j); + /* else if (imol >= 0) */ + /* which = find_special(onemols[imol]->special[iatom], */ + /* onemols[imol]->nspecial[iatom], */ + /* tag[j]-tagprev); */ + /* else which = 0; */ + if (which == 0) { + if (n < neigh_list.maxneighs) neighbors_i(n++) = j; + else n++; + } else if (minimum_image_check(delx,dely,delz)) { + if (n < neigh_list.maxneighs) neighbors_i(n++) = j; + else n++; + } + else if (which > 0) { + if (n < neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS); + else n++; + } + } else { + if (n < neigh_list.maxneighs) neighbors_i(n++) = j; + else n++; + } + } + + } + } + dev.team_barrier(); + } + + if (i >= 0 && i < nlocal) { + neigh_list.d_numneigh(i) = n; + neigh_list.d_ilist(i) = i; + } + + if (n > neigh_list.maxneighs) { + resize() = 1; + + if (n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop + } } } #endif @@ -779,7 +793,7 @@ void NeighborKokkosExecute::build_ItemGPU(typename Kokkos::TeamPolic template template KOKKOS_FUNCTION void NeighborKokkosExecute:: - build_Item_Ghost(const int &i) const + build_ItemGhost(const int &i) const { /* if necessary, goto next page and add pages */ int n = 0; @@ -833,24 +847,23 @@ void NeighborKokkosExecute:: /* tag[j]-tagprev); */ /* else which = 0; */ if (which == 0) { - if (n 0) { - if (n:: const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; if (rsq <= cutneighsq(itype,jtype)) { - if (n:: /* ---------------------------------------------------------------------- */ +#ifdef LMP_KOKKOS_GPU +template template +LAMMPS_DEVICE_FUNCTION inline +void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::TeamPolicy::member_type dev, + size_t sharedsize) const +{ + auto* sharedmem = static_cast(dev.team_shmem().get_shmem(sharedsize)); + // loop over atoms in i's bin + + const int atoms_per_bin = c_bins.extent(1); + const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin <1?1:dev.team_size()/atoms_per_bin; + const int TEAMS_PER_BIN = atoms_per_bin/dev.team_size()<1?1:atoms_per_bin/dev.team_size(); + const int MY_BIN = dev.team_rank()/atoms_per_bin; + + const int ibin = dev.league_rank()*BINS_PER_TEAM+MY_BIN; + + if (ibin >= mbins) return; + + X_FLOAT* other_x = sharedmem + 5*atoms_per_bin*MY_BIN; + int* other_id = (int*) &other_x[4 * atoms_per_bin]; + + int bincount_current = c_bincount[ibin]; + + for (int kk = 0; kk < TEAMS_PER_BIN; kk++) { + const int MY_II = dev.team_rank()%atoms_per_bin+kk*dev.team_size(); + const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1; + + int n = 0; + + X_FLOAT xtmp; + X_FLOAT ytmp; + X_FLOAT ztmp; + int itype; + const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i >= 0 && i < nall)?i:0); + + if (i >= 0) { + xtmp = x(i, 0); + ytmp = x(i, 1); + ztmp = x(i, 2); + itype = type(i); + other_x[MY_II] = xtmp; + other_x[MY_II + atoms_per_bin] = ytmp; + other_x[MY_II + 2 * atoms_per_bin] = ztmp; + other_x[MY_II + 3 * atoms_per_bin] = itype; + } + other_id[MY_II] = i; +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + int test = (__syncthreads_count(i >= 0 && i < nall) == 0); + if (test) return; +#elif defined(KOKKOS_ENABLE_SYCL) + int not_done = (i >= 0 && i < nall); + dev.team_reduce(Kokkos::Max(not_done)); + if (not_done == 0) return; +#elif defined(KOKKOS_ENABLE_OPENMPTARGET) + dev.team_barrier(); +#endif + + int which = 0; + int moltemplate; + if (molecular == Atom::TEMPLATE) moltemplate = 1; + else moltemplate = 0; + + const typename ArrayTypes::t_int_1d_const_um stencil + = d_stencil; + const typename ArrayTypes::t_int_1d_3_const_um stencilxyz + = d_stencilxyz; + + // loop over all atoms in surrounding bins in stencil including self + // when i is a ghost atom, must check if stencil bin is out of bounds + // skip i = j + // no molecular test when i = ghost atom + + for (int k = 0; k < nstencil; k++) { + const int jbin = ibin + stencil[k]; + bincount_current = c_bincount[jbin]; + int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1; + + if (j >= 0) { + other_x[MY_II] = x(j, 0); + other_x[MY_II + atoms_per_bin] = x(j, 1); + other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); + other_x[MY_II + 3 * atoms_per_bin] = type(j); + } + + other_id[MY_II] = j; + + dev.team_barrier(); + + if (i >= 0 && i < nlocal) { + #pragma unroll 8 + for (int m = 0; m < bincount_current; m++) { + const int j = other_id[m]; + + if (HalfNeigh && j <= i) continue; + else if (j == i) continue; + + const int jtype = other_x[m + 3 * atoms_per_bin]; + if (exclude && exclusion(i,j,itype,jtype)) continue; + + const X_FLOAT delx = xtmp - other_x[m]; + const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; + const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if (rsq <= cutneighsq(itype,jtype)) { + if (molecular != Atom::ATOMIC) { + if (!moltemplate) + which = NeighborKokkosExecute::find_special(i,j); + /* else if (imol >= 0) */ + /* which = find_special(onemols[imol]->special[iatom], */ + /* onemols[imol]->nspecial[iatom], */ + /* tag[j]-tagprev); */ + /* else which = 0; */ + if (which == 0) { + if (n < neigh_list.maxneighs) neighbors_i(n++) = j; + else n++; + } else if (minimum_image_check(delx,dely,delz)) { + if (n < neigh_list.maxneighs) neighbors_i(n++) = j; + else n++; + } + else if (which > 0) { + if (n < neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS); + else n++; + } + } else { + if (n < neigh_list.maxneighs) neighbors_i(n++) = j; + else n++; + } + } + } + } + dev.team_barrier(); + } + + int binxyz[3]; + const int ibin = coord2bin(xtmp, ytmp, ztmp, binxyz); + const int xbin = binxyz[0]; + const int ybin = binxyz[1]; + const int zbin = binxyz[2]; + for (int k = 0; k < nstencil; k++) { + const int xbin2 = xbin + stencilxyz(k,0); + const int ybin2 = ybin + stencilxyz(k,1); + const int zbin2 = zbin + stencilxyz(k,2); + if (xbin2 < 0 || xbin2 >= mbinx || + ybin2 < 0 || ybin2 >= mbiny || + zbin2 < 0 || zbin2 >= mbinz) continue; + const int jbin = ibin + stencil[k]; + int bincount_current = c_bincount[jbin]; + int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1; + + if (j >= 0) { + other_x[MY_II] = x(j, 0); + other_x[MY_II + atoms_per_bin] = x(j, 1); + other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); + other_x[MY_II + 3 * atoms_per_bin] = type(j); + } + + other_id[MY_II] = j; + + dev.team_barrier(); + + if (i >= nlocal && i < nall) { + #pragma unroll 8 + for (int m = 0; m < bincount_current; m++) { + const int j = other_id[m]; + + if (HalfNeigh && j <= i) continue; + else if (j == i) continue; + + const int jtype = other_x[m + 3 * atoms_per_bin]; + if (exclude && exclusion(i,j,itype,jtype)) continue; + + const X_FLOAT delx = xtmp - other_x[m]; + const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; + const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if (rsq <= cutneighsq(itype,jtype)) { + if (n < neigh_list.maxneighs) neighbors_i(n++) = j; + else n++; + } + } + } + dev.team_barrier(); + } + + if (i >= 0 && i < nall) { + neigh_list.d_numneigh(i) = n; + neigh_list.d_ilist(i) = i; + } + + if (n > neigh_list.maxneighs) { + resize() = 1; + + if (n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop + } + } +} +#endif + +/* ---------------------------------------------------------------------- */ + template template KOKKOS_FUNCTION void NeighborKokkosExecute:: @@ -940,12 +1155,12 @@ void NeighborKokkosExecute:: const X_FLOAT delx = xtmp - x(j, 0); const X_FLOAT dely = ytmp - x(j, 1); const X_FLOAT delz = ztmp - x(j, 2); - const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; const X_FLOAT radsum = radi + radius(j); const X_FLOAT cutsq = (radsum + skin) * (radsum + skin); if (rsq <= cutsq) { - if (n:: const X_FLOAT delx = xtmp - x(j, 0); const X_FLOAT dely = ytmp - x(j, 1); const X_FLOAT delz = ztmp - x(j, 2); - const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; const X_FLOAT radsum = radi + radius(j); const X_FLOAT cutsq = (radsum + skin) * (radsum + skin); if (rsq <= cutsq) { - if (n::build_ItemSizeGPU(typename Kokkos::TeamP /* loop over atoms in i's bin, */ const int atoms_per_bin = c_bins.extent(1); - const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin<1?1:dev.team_size()/atoms_per_bin; + const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin <1?1:dev.team_size()/atoms_per_bin; const int TEAMS_PER_BIN = atoms_per_bin/dev.team_size()<1?1:atoms_per_bin/dev.team_size(); const int MY_BIN = dev.team_rank()/atoms_per_bin; const int ibin = dev.league_rank()*BINS_PER_TEAM+MY_BIN; if (ibin >= mbins) return; - X_FLOAT* other_x = sharedmem; - other_x = other_x + 6*atoms_per_bin*MY_BIN; + X_FLOAT* other_x = sharedmem + 6*atoms_per_bin*MY_BIN; int* other_id = (int*) &other_x[5 * atoms_per_bin]; int bincount_current = c_bincount[ibin]; @@ -1044,7 +1258,7 @@ void NeighborKokkosExecute::build_ItemSizeGPU(typename Kokkos::TeamP X_FLOAT ztmp; X_FLOAT radi; int itype; - const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i>=0&&i= 0 && i < nlocal)?i:0); const int mask_history = 3 << SBBITS; if (i >= 0) { @@ -1061,12 +1275,12 @@ void NeighborKokkosExecute::build_ItemSizeGPU(typename Kokkos::TeamP } other_id[MY_II] = i; #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - int test = (__syncthreads_count(i >= 0 && i <= nlocal) == 0); + int test = (__syncthreads_count(i >= 0 && i < nlocal) == 0); if (test) return; #elif defined(KOKKOS_ENABLE_SYCL) - int not_done = (i >= 0 && i <= nlocal); + int not_done = (i >= 0 && i < nlocal); dev.team_reduce(Kokkos::Max(not_done)); - if(not_done == 0) return; + if (not_done == 0) return; #elif defined(KOKKOS_ENABLE_OPENMPTARGET) dev.team_barrier(); #endif @@ -1099,12 +1313,12 @@ void NeighborKokkosExecute::build_ItemSizeGPU(typename Kokkos::TeamP const X_FLOAT delx = xtmp - other_x[m]; const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; - const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; const X_FLOAT radsum = radi + other_x[m + 4 * atoms_per_bin]; const X_FLOAT cutsq = (radsum + skin) * (radsum + skin); if (rsq <= cutsq) { - if (n::build_ItemSizeGPU(typename Kokkos::TeamP const X_FLOAT delx = xtmp - other_x[m]; const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; - const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; const X_FLOAT radsum = radi + other_x[m + 4 * atoms_per_bin]; const X_FLOAT cutsq = (radsum + skin) * (radsum + skin); if (rsq <= cutsq) { - if (n KOKKOS_FUNCTION - void build_Item_Ghost(const int &i) const; + void build_ItemGhost(const int &i) const; template KOKKOS_FUNCTION @@ -316,6 +316,11 @@ class NeighborKokkosExecute void build_ItemGPU(typename Kokkos::TeamPolicy::member_type dev, size_t sharedsize) const; + template + LAMMPS_DEVICE_FUNCTION inline + void build_ItemGhostGPU(typename Kokkos::TeamPolicy::member_type dev, + size_t sharedsize) const; + template LAMMPS_DEVICE_FUNCTION inline void build_ItemSizeGPU(typename Kokkos::TeamPolicy::member_type dev, @@ -422,13 +427,24 @@ struct NPairKokkosBuildFunctorGhost { typedef DeviceType device_type; const NeighborKokkosExecute c; + size_t sharedsize; - NPairKokkosBuildFunctorGhost(const NeighborKokkosExecute &_c):c(_c) {} + NPairKokkosBuildFunctorGhost(const NeighborKokkosExecute &_c, + size_t _sharedsize):c(_c), + sharedsize(_sharedsize) {} KOKKOS_INLINE_FUNCTION void operator() (const int & i) const { - c.template build_Item_Ghost(i); + c.template build_ItemGhost(i); } + +#ifdef LMP_KOKKOS_GPU + LAMMPS_DEVICE_FUNCTION inline + void operator() (typename Kokkos::TeamPolicy::member_type dev) const { + c.template build_ItemGhostGPU(dev, sharedsize); + } + size_t team_shmem_size(const int team_size) const { (void) team_size; return sharedsize; } +#endif }; template From e2e046c4526b2113d3453b38cc3632b0cb9153e9 Mon Sep 17 00:00:00 2001 From: Nick Curtis Date: Fri, 25 Mar 2022 11:57:42 -0400 Subject: [PATCH 07/15] add missing host overload to make Clang happy Change-Id: Ib0bd9dec3ecc6bf13b9894b07024172a9810cd77 --- src/KOKKOS/npair_kokkos.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h index 7a63957081..94f94d0593 100644 --- a/src/KOKKOS/npair_kokkos.h +++ b/src/KOKKOS/npair_kokkos.h @@ -447,6 +447,25 @@ struct NPairKokkosBuildFunctorGhost { #endif }; +template +struct NPairKokkosBuildFunctorGhost { + typedef LMPHostType device_type; + + const NeighborKokkosExecute c; + size_t sharedsize; + + NPairKokkosBuildFunctorGhost(const NeighborKokkosExecute &_c, + size_t _sharedsize):c(_c), + sharedsize(_sharedsize) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int & i) const { + c.template build_ItemGhost(i); + } + + void operator() (typename Kokkos::TeamPolicy::member_type /*dev*/) const {} // Should error out +}; + template struct NPairKokkosBuildFunctorSize { typedef DeviceType device_type; From b14086f3f9c8bd50939576a1df5b1a8f84e48e0b Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 25 Mar 2022 12:12:04 -0700 Subject: [PATCH 08/15] Update docs --- doc/src/package.rst | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/doc/src/package.rst b/doc/src/package.rst index 437601dc60..c48d364b9a 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -81,6 +81,9 @@ Syntax *neigh/thread* value = *off* or *on* off = thread only over atoms on = thread over both atoms and neighbors + *neigh/transpose* value = *off* or *on* + off = use same memory layout for GPU neigh list build as pair style + on = use transposed memory layout for GPU neigh list build *newton* = *off* or *on* off = set Newton pairwise and bonded flags off on = set Newton pairwise and bonded flags on @@ -463,6 +466,13 @@ potentials support this keyword yet, and only thread over atoms. Many simple pairwise potentials such as Lennard-Jones do support threading over both atoms and neighbors. +If the *neigh/transpose* keyword is set to *off*, then the KOKKOS package +will use the same memory layout for building the neigh list on GPUs as +used for the pair style. When this keyword is set to *on* it will use a +different (transposed) memory layout to build the neigh list on GPUs. +This can be faster in some cases (e.g. ReaxFF HNS benchmark) but slower +in others (e.g. Lennard Jones benchmark). + The *newton* keyword sets the Newton flags for pairwise and bonded interactions to *off* or *on*, the same as the :doc:`newton ` command allows. The default for GPUs is *off* because this will almost @@ -681,15 +691,16 @@ script or via the "-pk intel" :doc:`command-line switch `. For the KOKKOS package, the option defaults for GPUs are neigh = full, neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default -value, comm = device, gpu/aware = on. When LAMMPS can safely detect -that GPU-aware MPI is not available, the default value of gpu/aware -becomes "off". For CPUs or Xeon Phis, the option defaults are neigh = -half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. The -option neigh/thread = on when there are 16K atoms or less on an MPI -rank, otherwise it is "off". These settings are made automatically by -the required "-k on" :doc:`command-line switch `. You can -change them by using the package kokkos command in your input script or -via the :doc:`-pk kokkos command-line switch `. +value, comm = device, neigh/transpose = off, gpu/aware = on. When +LAMMPS can safely detect that GPU-aware MPI is not available, the +default value of gpu/aware becomes "off". For CPUs or Xeon Phis, the +option defaults are neigh = half, neigh/qeq = half, newton = on, +binsize = 0.0, and comm = no. The option neigh/thread = on when there +are 16K atoms or less on an MPI rank, otherwise it is "off". These +settings are made automatically by the required "-k on" +:doc:`command-line switch `. You can change them by using +the package kokkos command in your input script or via the :doc:`-pk +kokkos command-line switch `. For the OMP package, the default is Nthreads = 0 and the option defaults are neigh = yes. These settings are made automatically if From 9f0eb2ea1a4d2918c9951cdc25a0e067ec131c61 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 25 Mar 2022 12:17:33 -0700 Subject: [PATCH 09/15] Add note about memory --- doc/src/package.rst | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/doc/src/package.rst b/doc/src/package.rst index c48d364b9a..a037c91999 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -466,12 +466,15 @@ potentials support this keyword yet, and only thread over atoms. Many simple pairwise potentials such as Lennard-Jones do support threading over both atoms and neighbors. -If the *neigh/transpose* keyword is set to *off*, then the KOKKOS package -will use the same memory layout for building the neigh list on GPUs as -used for the pair style. When this keyword is set to *on* it will use a -different (transposed) memory layout to build the neigh list on GPUs. -This can be faster in some cases (e.g. ReaxFF HNS benchmark) but slower -in others (e.g. Lennard Jones benchmark). +If the *neigh/transpose* keyword is set to *off*, then the KOKKOS +package will use the same memory layout for building the neigh list on +GPUs as used for the pair style. When this keyword is set to *on* it +will use a different (transposed) memory layout to build the neigh +list on GPUs. This can be faster in some cases (e.g. ReaxFF HNS +benchmark) but slower in others (e.g. Lennard Jones benchmark). The +copy between different memory layouts is done out of place and +therefore doubles the memory overhead of the neigh list, which can be +signicant. The *newton* keyword sets the Newton flags for pairwise and bonded interactions to *off* or *on*, the same as the :doc:`newton ` From d131223161445c1b2f9c8bae8bdcd0e061bb6e8b Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 25 Mar 2022 12:59:45 -0700 Subject: [PATCH 10/15] Restore changes lost in Git shuffle --- src/KOKKOS/npair_kokkos.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index d42ae4510f..dbb9120832 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -1064,7 +1064,7 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team } int binxyz[3]; - const int ibin = coord2bin(xtmp, ytmp, ztmp, binxyz); + coord2bin(xtmp, ytmp, ztmp, binxyz); const int xbin = binxyz[0]; const int ybin = binxyz[1]; const int zbin = binxyz[2]; @@ -1072,9 +1072,10 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team const int xbin2 = xbin + stencilxyz(k,0); const int ybin2 = ybin + stencilxyz(k,1); const int zbin2 = zbin + stencilxyz(k,2); + int active = 1; if (xbin2 < 0 || xbin2 >= mbinx || ybin2 < 0 || ybin2 >= mbiny || - zbin2 < 0 || zbin2 >= mbinz) continue; + zbin2 < 0 || zbin2 >= mbinz) active = 0; const int jbin = ibin + stencil[k]; int bincount_current = c_bincount[jbin]; int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1; @@ -1090,7 +1091,7 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team dev.team_barrier(); - if (i >= nlocal && i < nall) { + if (active && i >= nlocal && i < nall) { #pragma unroll 8 for (int m = 0; m < bincount_current; m++) { const int j = other_id[m]; From d648247b801882ab82f1d76e95547c8c76a7febe Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 25 Mar 2022 13:03:56 -0700 Subject: [PATCH 11/15] whitespace --- src/KOKKOS/npair_kokkos.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index dbb9120832..fa9b722b3b 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -726,7 +726,7 @@ void NeighborKokkosExecute::build_ItemGPU(typename Kokkos::TeamPolic other_x[MY_II + atoms_per_bin] = x(j, 1); other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); other_x[MY_II + 3 * atoms_per_bin] = type(j); - } + } other_id[MY_II] = j; @@ -1011,7 +1011,7 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team other_x[MY_II + atoms_per_bin] = x(j, 1); other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); other_x[MY_II + 3 * atoms_per_bin] = type(j); - } + } other_id[MY_II] = j; @@ -1085,7 +1085,7 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team other_x[MY_II + atoms_per_bin] = x(j, 1); other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); other_x[MY_II + 3 * atoms_per_bin] = type(j); - } + } other_id[MY_II] = j; From 0c1516b34d7c11a59abf8ca099f9a97cce5eb186 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 25 Mar 2022 14:14:42 -0700 Subject: [PATCH 12/15] Add another missing change --- src/KOKKOS/npair_kokkos.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index fa9b722b3b..d0ce09cf19 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -639,6 +639,7 @@ void NeighborKokkosExecute::build_ItemGPU(typename Kokkos::TeamPolic other_x[MY_II + 3 * atoms_per_bin] = itype; } other_id[MY_II] = i; + #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) int test = (__syncthreads_count(i >= 0 && i < nlocal) == 0); if (test) return; @@ -962,7 +963,9 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team X_FLOAT ytmp; X_FLOAT ztmp; int itype; - const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i >= 0 && i < nall)?i:0); + const int index = (i >= 0 && i < nall) ? i : 0; + const AtomNeighbors neighbors_i = neigh_transpose ? + neigh_list.get_neighbors_transpose(index) : neigh_list.get_neighbors(index); if (i >= 0) { xtmp = x(i, 0); From 48332c3b180bc0ba1f22765dcda53ea458b48f7b Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 28 Mar 2022 14:17:44 -0600 Subject: [PATCH 13/15] Rename variable --- src/KOKKOS/transpose_helper_kokkos.h | 58 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/KOKKOS/transpose_helper_kokkos.h b/src/KOKKOS/transpose_helper_kokkos.h index fd7088eac7..d4b1ad929c 100644 --- a/src/KOKKOS/transpose_helper_kokkos.h +++ b/src/KOKKOS/transpose_helper_kokkos.h @@ -43,7 +43,7 @@ struct TransposeHelperKokkos { using t_view_value = typename t_view_dst::value_type; // 32x32 tiles, will update so each thread does multiple loads - static constexpr int vector_length = 32; + static constexpr int tile_size = 32; static constexpr int bank_pad = 1; static constexpr int elem_size = sizeof(t_view_value); @@ -58,7 +58,7 @@ struct TransposeHelperKokkos { int extent_tiles[2]; // 1 if extent is divisible by vector length, 0 otherwise - int extent_is_multiple_vector_length[2]; + int extent_is_multiple_tile_size[2]; // number of teams int n_teams; @@ -77,17 +77,17 @@ struct TransposeHelperKokkos { src_is_layout_right = std::is_same::value; - extent_tiles[0] = (d_dst.extent(0) + vector_length - 1) / vector_length; - extent_tiles[1] = (d_dst.extent(1) + vector_length - 1) / vector_length; + extent_tiles[0] = (d_dst.extent(0) + tile_size - 1) / tile_size; + extent_tiles[1] = (d_dst.extent(1) + tile_size - 1) / tile_size; - extent_is_multiple_vector_length[0] = (extent_tiles[0] * vector_length == d_dst.extent(0)) ? 1 : 0; - extent_is_multiple_vector_length[1] = (extent_tiles[1] * vector_length == d_dst.extent(1)) ? 1 : 0; + extent_is_multiple_tile_size[0] = (extent_tiles[0] * tile_size == d_dst.extent(0)) ? 1 : 0; + extent_is_multiple_tile_size[1] = (extent_tiles[1] * tile_size == d_dst.extent(1)) ? 1 : 0; n_teams = (extent_tiles[0] * extent_tiles[1] + threads_per_team - 1) / threads_per_team; - shared_mem_per_thread = vector_length * (vector_length + bank_pad) * elem_size; + shared_mem_per_thread = tile_size * (tile_size + bank_pad) * elem_size; - Kokkos::TeamPolicy transpose_policy(n_teams, threads_per_team, vector_length); + Kokkos::TeamPolicy transpose_policy(n_teams, threads_per_team, tile_size); transpose_policy = transpose_policy.set_scratch_size(0, Kokkos::PerThread(shared_mem_per_thread)); Kokkos::parallel_for(transpose_policy, *this); @@ -115,45 +115,45 @@ struct TransposeHelperKokkos { } int elem[2]; - elem[0] = extent_tile_id[0] * vector_length; - elem[1] = extent_tile_id[1] * vector_length; + elem[0] = extent_tile_id[0] * tile_size; + elem[1] = extent_tile_id[1] * tile_size; if (elem[0] >= d_dst.extent(0) || elem[1] >= d_dst.extent(1)) return; - // determine if a row/column is a full `vector_length` in size or not + // determine if a row/column is a full `tile_size` in size or not bool perfect_pad[2]; - perfect_pad[0] = (extent_is_multiple_vector_length[0] == 1 || extent_tile_id[0] + 1 < extent_tiles[0]); - perfect_pad[1] = (extent_is_multiple_vector_length[1] == 1 || extent_tile_id[1] + 1 < extent_tiles[1]); + perfect_pad[0] = (extent_is_multiple_tile_size[0] == 1 || extent_tile_id[0] + 1 < extent_tiles[0]); + perfect_pad[1] = (extent_is_multiple_tile_size[1] == 1 || extent_tile_id[1] + 1 < extent_tiles[1]); // load phase if (src_is_layout_right) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size), [&] (const int j) { if (elem[1] + j < d_src.extent(1)) { if (perfect_pad[0]) { - for (int i = 0; i < vector_length; i++) - buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + for (int i = 0; i < tile_size; i++) + buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); } else { for (int i = 0; i < (d_src.extent(0) - elem[0]); i++) - buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); } } }); } else { // src is layout left - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size), [&] (const int i) { if (elem[0] + i < d_src.extent(0)) { if (perfect_pad[1]) { - for (int j = 0; j < vector_length; j++) - buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + for (int j = 0; j < tile_size; j++) + buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); } else { for (int j = 0; j < (d_src.extent(1) - elem[1]); j++) - buffer[i * (vector_length + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); + buffer[i * (tile_size + bank_pad) + j] = d_src(elem[0] + i, elem[1] + j); } } }); @@ -164,32 +164,32 @@ struct TransposeHelperKokkos { // save phase if (src_is_layout_right) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size), [&] (const int i) { if (elem[0] + i < d_dst.extent(0)) { if (perfect_pad[1]) { - for (int j = 0; j < vector_length; j++) - d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + for (int j = 0; j < tile_size; j++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j]; } else { for (int j = 0; j < (d_dst.extent(1) - elem[1]); j++) - d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j]; } } }); } else { // src is layout left - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, vector_length), + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team_member, tile_size), [&] (const int j) { if (elem[1] + j < d_dst.extent(1)) { if (perfect_pad[0]) { - for (int i = 0; i < vector_length; i++) - d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + for (int i = 0; i < tile_size; i++) + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j]; } else { for (int i = 0; i < (d_dst.extent(0) - elem[0]); i++) - d_dst(elem[0] + i, elem[1] + j) = buffer[i * (vector_length + bank_pad) + j]; + d_dst(elem[0] + i, elem[1] + j) = buffer[i * (tile_size + bank_pad) + j]; } } }); From 88c075ba90c6d95b4ed8e1846ebb518cd6fb5ad4 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 28 Mar 2022 14:19:50 -0600 Subject: [PATCH 14/15] Tune for HIP --- src/KOKKOS/transpose_helper_kokkos.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/transpose_helper_kokkos.h b/src/KOKKOS/transpose_helper_kokkos.h index d4b1ad929c..464bf04c4c 100644 --- a/src/KOKKOS/transpose_helper_kokkos.h +++ b/src/KOKKOS/transpose_helper_kokkos.h @@ -42,12 +42,18 @@ struct TransposeHelperKokkos { using t_view_value = typename t_view_dst::value_type; - // 32x32 tiles, will update so each thread does multiple loads + + // set tile size, will update so each thread does multiple loads +#ifdef KOKKOS_ENABLE_HIP + static constexpr int tile_size = 16; + static constexpr int threads_per_team = 8; +#else static constexpr int tile_size = 32; + static constexpr int threads_per_team = 4; +#endif static constexpr int bank_pad = 1; static constexpr int elem_size = sizeof(t_view_value); - static constexpr int threads_per_team = 4; t_view_dst d_dst; t_view_src d_src; From 60e2b84e094f49138ee047490eb53ddeeee9ca02 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 28 Mar 2022 14:43:16 -0600 Subject: [PATCH 15/15] Fuse loops --- src/KOKKOS/npair_kokkos.cpp | 76 ++++++++++--------------------------- 1 file changed, 20 insertions(+), 56 deletions(-) diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp index d0ce09cf19..f42ca39404 100644 --- a/src/KOKKOS/npair_kokkos.cpp +++ b/src/KOKKOS/npair_kokkos.cpp @@ -1004,7 +1004,24 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team // skip i = j // no molecular test when i = ghost atom + int ghost = (i >= nlocal && i < nall); + int binxyz[3]; + if (ghost) + coord2bin(xtmp, ytmp, ztmp, binxyz); + const int xbin = binxyz[0]; + const int ybin = binxyz[1]; + const int zbin = binxyz[2]; for (int k = 0; k < nstencil; k++) { + int active = 1; + if (ghost) { + const int xbin2 = xbin + stencilxyz(k,0); + const int ybin2 = ybin + stencilxyz(k,1); + const int zbin2 = zbin + stencilxyz(k,2); + if (xbin2 < 0 || xbin2 >= mbinx || + ybin2 < 0 || ybin2 >= mbiny || + zbin2 < 0 || zbin2 >= mbinz) active = 0; + } + const int jbin = ibin + stencil[k]; bincount_current = c_bincount[jbin]; int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1; @@ -1020,8 +1037,8 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team dev.team_barrier(); - if (i >= 0 && i < nlocal) { - #pragma unroll 8 + if (active && i >= 0 && i < nall) { + #pragma unroll 4 for (int m = 0; m < bincount_current; m++) { const int j = other_id[m]; @@ -1037,7 +1054,7 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; if (rsq <= cutneighsq(itype,jtype)) { - if (molecular != Atom::ATOMIC) { + if (molecular != Atom::ATOMIC && !ghost) { if (!moltemplate) which = NeighborKokkosExecute::find_special(i,j); /* else if (imol >= 0) */ @@ -1066,59 +1083,6 @@ void NeighborKokkosExecute::build_ItemGhostGPU(typename Kokkos::Team dev.team_barrier(); } - int binxyz[3]; - coord2bin(xtmp, ytmp, ztmp, binxyz); - const int xbin = binxyz[0]; - const int ybin = binxyz[1]; - const int zbin = binxyz[2]; - for (int k = 0; k < nstencil; k++) { - const int xbin2 = xbin + stencilxyz(k,0); - const int ybin2 = ybin + stencilxyz(k,1); - const int zbin2 = zbin + stencilxyz(k,2); - int active = 1; - if (xbin2 < 0 || xbin2 >= mbinx || - ybin2 < 0 || ybin2 >= mbiny || - zbin2 < 0 || zbin2 >= mbinz) active = 0; - const int jbin = ibin + stencil[k]; - int bincount_current = c_bincount[jbin]; - int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1; - - if (j >= 0) { - other_x[MY_II] = x(j, 0); - other_x[MY_II + atoms_per_bin] = x(j, 1); - other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); - other_x[MY_II + 3 * atoms_per_bin] = type(j); - } - - other_id[MY_II] = j; - - dev.team_barrier(); - - if (active && i >= nlocal && i < nall) { - #pragma unroll 8 - for (int m = 0; m < bincount_current; m++) { - const int j = other_id[m]; - - if (HalfNeigh && j <= i) continue; - else if (j == i) continue; - - const int jtype = other_x[m + 3 * atoms_per_bin]; - if (exclude && exclusion(i,j,itype,jtype)) continue; - - const X_FLOAT delx = xtmp - other_x[m]; - const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; - const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; - const X_FLOAT rsq = delx*delx + dely*dely + delz*delz; - - if (rsq <= cutneighsq(itype,jtype)) { - if (n < neigh_list.maxneighs) neighbors_i(n++) = j; - else n++; - } - } - } - dev.team_barrier(); - } - if (i >= 0 && i < nall) { neigh_list.d_numneigh(i) = n; neigh_list.d_ilist(i) = i;