Add version of Kokkos Ghost neigh list optimized for GPUs
This commit is contained in:
@ -158,7 +158,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
|
||||
mbins,nstencil,
|
||||
k_stencil.view<DeviceType>(),
|
||||
k_stencilxyz.view<DeviceType>(),
|
||||
nlocal,
|
||||
nlocal,nall,
|
||||
atomKK->k_x.view<DeviceType>(),
|
||||
atomKK->k_radius.view<DeviceType>(),
|
||||
atomKK->k_type.view<DeviceType>(),
|
||||
@ -229,18 +229,33 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
|
||||
#define BINS_PER_BLOCK 2
|
||||
const int factor = atoms_per_bin<64?2:1;
|
||||
const int factor = atoms_per_bin <64?2:1;
|
||||
#else
|
||||
const int factor = 1;
|
||||
#endif
|
||||
|
||||
if (GHOST) {
|
||||
NPairKokkosBuildFunctorGhost<DeviceType,HALF_NEIGH> f(data);
|
||||
NPairKokkosBuildFunctorGhost<DeviceType,HALF_NEIGH> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
if (ExecutionSpaceFromDevice<DeviceType>::space == Device) {
|
||||
int team_size = atoms_per_bin*factor;
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType>(team_size,Kokkos::AUTO).team_size_max(f,Kokkos::ParallelForTag());
|
||||
if (team_size <= team_size_max) {
|
||||
Kokkos::TeamPolicy<DeviceType> config((mbins+factor-1)/factor,team_size);
|
||||
Kokkos::parallel_for(config, f);
|
||||
} else { // fall back to flat method
|
||||
f.sharedsize = 0;
|
||||
Kokkos::parallel_for(nall, f);
|
||||
}
|
||||
} else
|
||||
Kokkos::parallel_for(nall, f);
|
||||
#else
|
||||
Kokkos::parallel_for(nall, f);
|
||||
#endif
|
||||
} else {
|
||||
if (newton_pair) {
|
||||
if (SIZE) {
|
||||
NPairKokkosBuildFunctorSize<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
|
||||
NPairKokkosBuildFunctorSize<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 6 * sizeof(X_FLOAT) * factor);
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
if (ExecutionSpaceFromDevice<DeviceType>::space == Device) {
|
||||
int team_size = atoms_per_bin*factor;
|
||||
@ -444,7 +459,7 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
const X_FLOAT delx = xtmp - x(j, 0);
|
||||
const X_FLOAT dely = ytmp - x(j, 1);
|
||||
const X_FLOAT delz = ztmp - x(j, 2);
|
||||
const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
if (rsq <= cutneighsq(itype,jtype)) {
|
||||
if (molecular != Atom::ATOMIC) {
|
||||
if (!moltemplate)
|
||||
@ -455,18 +470,18 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
/* tag[j]-tagprev); */
|
||||
/* else which = 0; */
|
||||
if (which == 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
} else if (minimum_image_check(delx,dely,delz)) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
else if (which > 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
else n++;
|
||||
}
|
||||
} else {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
}
|
||||
@ -500,7 +515,7 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
const X_FLOAT delx = xtmp - x(j, 0);
|
||||
const X_FLOAT dely = ytmp - x(j, 1);
|
||||
const X_FLOAT delz = ztmp - x(j, 2);
|
||||
const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
|
||||
if (rsq <= cutneighsq(itype,jtype)) {
|
||||
if (molecular != Atom::ATOMIC) {
|
||||
@ -512,18 +527,18 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
/* tag[j]-tagprev); */
|
||||
/* else which = 0; */
|
||||
if (which == 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
} else if (minimum_image_check(delx,dely,delz)) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
else if (which > 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
else n++;
|
||||
}
|
||||
} else {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
}
|
||||
@ -571,162 +586,83 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
|
||||
/* loop over atoms in i's bin,
|
||||
*/
|
||||
const int atoms_per_bin = c_bins.extent(1);
|
||||
const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin<1?1:dev.team_size()/atoms_per_bin;
|
||||
const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin <1?1:dev.team_size()/atoms_per_bin;
|
||||
const int TEAMS_PER_BIN = atoms_per_bin/dev.team_size()<1?1:atoms_per_bin/dev.team_size();
|
||||
const int MY_BIN = dev.team_rank()/atoms_per_bin;
|
||||
|
||||
const int ibin = dev.league_rank()*BINS_PER_TEAM+MY_BIN;
|
||||
|
||||
if (ibin >= mbins) return;
|
||||
X_FLOAT* other_x = sharedmem;
|
||||
other_x = other_x + 5*atoms_per_bin*MY_BIN;
|
||||
|
||||
X_FLOAT* other_x = sharedmem + 5*atoms_per_bin*MY_BIN;
|
||||
int* other_id = (int*) &other_x[4 * atoms_per_bin];
|
||||
|
||||
int bincount_current = c_bincount[ibin];
|
||||
|
||||
for (int kk = 0; kk < TEAMS_PER_BIN; kk++) {
|
||||
const int MY_II = dev.team_rank()%atoms_per_bin+kk*dev.team_size();
|
||||
const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1;
|
||||
/* if necessary, goto next page and add pages */
|
||||
const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1;
|
||||
/* if necessary, goto next page and add pages */
|
||||
|
||||
int n = 0;
|
||||
int n = 0;
|
||||
|
||||
X_FLOAT xtmp;
|
||||
X_FLOAT ytmp;
|
||||
X_FLOAT ztmp;
|
||||
int itype;
|
||||
const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i>=0&&i<nlocal)?i:0);
|
||||
X_FLOAT xtmp;
|
||||
X_FLOAT ytmp;
|
||||
X_FLOAT ztmp;
|
||||
int itype;
|
||||
const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i >= 0 && i < nlocal)?i:0);
|
||||
|
||||
if (i >= 0) {
|
||||
xtmp = x(i, 0);
|
||||
ytmp = x(i, 1);
|
||||
ztmp = x(i, 2);
|
||||
itype = type(i);
|
||||
other_x[MY_II] = xtmp;
|
||||
other_x[MY_II + atoms_per_bin] = ytmp;
|
||||
other_x[MY_II + 2 * atoms_per_bin] = ztmp;
|
||||
other_x[MY_II + 3 * atoms_per_bin] = itype;
|
||||
}
|
||||
other_id[MY_II] = i;
|
||||
if (i >= 0) {
|
||||
xtmp = x(i, 0);
|
||||
ytmp = x(i, 1);
|
||||
ztmp = x(i, 2);
|
||||
itype = type(i);
|
||||
other_x[MY_II] = xtmp;
|
||||
other_x[MY_II + atoms_per_bin] = ytmp;
|
||||
other_x[MY_II + 2 * atoms_per_bin] = ztmp;
|
||||
other_x[MY_II + 3 * atoms_per_bin] = itype;
|
||||
}
|
||||
other_id[MY_II] = i;
|
||||
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
|
||||
int test = (__syncthreads_count(i >= 0 && i <= nlocal) == 0);
|
||||
if (test) return;
|
||||
int test = (__syncthreads_count(i >= 0 && i < nlocal) == 0);
|
||||
if (test) return;
|
||||
#elif defined(KOKKOS_ENABLE_SYCL)
|
||||
int not_done = (i >= 0 && i <= nlocal);
|
||||
dev.team_reduce(Kokkos::Max<int>(not_done));
|
||||
if(not_done == 0) return;
|
||||
int not_done = (i >= 0 && i < nlocal);
|
||||
dev.team_reduce(Kokkos::Max<int>(not_done));
|
||||
if(not_done == 0) return;
|
||||
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
|
||||
dev.team_barrier();
|
||||
dev.team_barrier();
|
||||
#endif
|
||||
|
||||
if (i >= 0 && i < nlocal) {
|
||||
#pragma unroll 4
|
||||
for (int m = 0; m < bincount_current; m++) {
|
||||
int j = other_id[m];
|
||||
const int jtype = other_x[m + 3 * atoms_per_bin];
|
||||
|
||||
//for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using halfneighborlists
|
||||
if ((j == i) ||
|
||||
(HalfNeigh && !Newton && (j < i)) ||
|
||||
(HalfNeigh && Newton &&
|
||||
((j < i) ||
|
||||
((j >= nlocal) && ((x(j, 2) < ztmp) || (x(j, 2) == ztmp && x(j, 1) < ytmp) ||
|
||||
(x(j, 2) == ztmp && x(j, 1) == ytmp && x(j, 0) < xtmp)))))
|
||||
) continue;
|
||||
if (Tri) {
|
||||
if (x(j,2) < ztmp) continue;
|
||||
if (x(j,2) == ztmp) {
|
||||
if (x(j,1) < ytmp) continue;
|
||||
if (x(j,1) == ytmp) {
|
||||
if (x(j,0) < xtmp) continue;
|
||||
if (x(j,0) == xtmp && j <= i) continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (exclude && exclusion(i,j,itype,jtype)) continue;
|
||||
const X_FLOAT delx = xtmp - other_x[m];
|
||||
const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin];
|
||||
const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin];
|
||||
const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if (rsq <= cutneighsq(itype,jtype)) {
|
||||
if (molecular != Atom::ATOMIC) {
|
||||
int which = 0;
|
||||
if (!moltemplate)
|
||||
which = NeighborKokkosExecute<DeviceType>::find_special(i,j);
|
||||
/* else if (imol >= 0) */
|
||||
/* which = find_special(onemols[imol]->special[iatom], */
|
||||
/* onemols[imol]->nspecial[iatom], */
|
||||
/* tag[j]-tagprev); */
|
||||
/* else which = 0; */
|
||||
if (which == 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
} else if (minimum_image_check(delx,dely,delz)) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
else if (which > 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
else n++;
|
||||
}
|
||||
} else {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
dev.team_barrier();
|
||||
|
||||
const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
|
||||
= d_stencil;
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
const int jbin = ibin + stencil[k];
|
||||
|
||||
if (ibin == jbin) continue;
|
||||
|
||||
bincount_current = c_bincount[jbin];
|
||||
int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1;
|
||||
|
||||
if (j >= 0) {
|
||||
other_x[MY_II] = x(j, 0);
|
||||
other_x[MY_II + atoms_per_bin] = x(j, 1);
|
||||
other_x[MY_II + 2 * atoms_per_bin] = x(j, 2);
|
||||
other_x[MY_II + 3 * atoms_per_bin] = type(j);
|
||||
}
|
||||
|
||||
other_id[MY_II] = j;
|
||||
|
||||
dev.team_barrier();
|
||||
|
||||
if (i >= 0 && i < nlocal) {
|
||||
#pragma unroll 8
|
||||
#pragma unroll 4
|
||||
for (int m = 0; m < bincount_current; m++) {
|
||||
const int j = other_id[m];
|
||||
int j = other_id[m];
|
||||
const int jtype = other_x[m + 3 * atoms_per_bin];
|
||||
|
||||
//if(HalfNeigh && (j < i)) continue;
|
||||
if (HalfNeigh && !Newton && (j < i)) continue;
|
||||
if (!HalfNeigh && j==i) continue;
|
||||
if (Tri) {
|
||||
if (x(j,2) < ztmp) continue;
|
||||
if (x(j,2) == ztmp) {
|
||||
if (x(j,1) < ytmp) continue;
|
||||
if (x(j,1) == ytmp) {
|
||||
if (x(j,0) < xtmp) continue;
|
||||
if (x(j,0) == xtmp && j <= i) continue;
|
||||
//for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using halfneighborlists
|
||||
if ((j == i) ||
|
||||
(HalfNeigh && !Newton && (j < i)) ||
|
||||
(HalfNeigh && Newton &&
|
||||
((j < i) ||
|
||||
((j >= nlocal) && ((x(j, 2) < ztmp) || (x(j, 2) == ztmp && x(j, 1) < ytmp) ||
|
||||
(x(j, 2) == ztmp && x(j, 1) == ytmp && x(j, 0) < xtmp)))))
|
||||
) continue;
|
||||
if (Tri) {
|
||||
if (x(j,2) < ztmp) continue;
|
||||
if (x(j,2) == ztmp) {
|
||||
if (x(j,1) < ytmp) continue;
|
||||
if (x(j,1) == ytmp) {
|
||||
if (x(j,0) < xtmp) continue;
|
||||
if (x(j,0) == xtmp && j <= i) continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (exclude && exclusion(i,j,itype,jtype)) continue;
|
||||
|
||||
const X_FLOAT delx = xtmp - other_x[m];
|
||||
const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin];
|
||||
const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin];
|
||||
const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
|
||||
if (rsq <= cutneighsq(itype,jtype)) {
|
||||
if (molecular != Atom::ATOMIC) {
|
||||
@ -739,18 +675,18 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
|
||||
/* tag[j]-tagprev); */
|
||||
/* else which = 0; */
|
||||
if (which == 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
} else if (minimum_image_check(delx,dely,delz)) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
else if (which > 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
else n++;
|
||||
}
|
||||
} else {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
}
|
||||
@ -758,18 +694,96 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
|
||||
}
|
||||
}
|
||||
dev.team_barrier();
|
||||
}
|
||||
|
||||
if (i >= 0 && i < nlocal) {
|
||||
neigh_list.d_numneigh(i) = n;
|
||||
neigh_list.d_ilist(i) = i;
|
||||
}
|
||||
const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
|
||||
= d_stencil;
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
const int jbin = ibin + stencil[k];
|
||||
|
||||
if (n > neigh_list.maxneighs) {
|
||||
resize() = 1;
|
||||
if (ibin == jbin) continue;
|
||||
|
||||
if (n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
|
||||
}
|
||||
bincount_current = c_bincount[jbin];
|
||||
int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1;
|
||||
|
||||
if (j >= 0) {
|
||||
other_x[MY_II] = x(j, 0);
|
||||
other_x[MY_II + atoms_per_bin] = x(j, 1);
|
||||
other_x[MY_II + 2 * atoms_per_bin] = x(j, 2);
|
||||
other_x[MY_II + 3 * atoms_per_bin] = type(j);
|
||||
}
|
||||
|
||||
other_id[MY_II] = j;
|
||||
|
||||
dev.team_barrier();
|
||||
|
||||
if (i >= 0 && i < nlocal) {
|
||||
#pragma unroll 8
|
||||
for (int m = 0; m < bincount_current; m++) {
|
||||
const int j = other_id[m];
|
||||
const int jtype = other_x[m + 3 * atoms_per_bin];
|
||||
|
||||
//if(HalfNeigh && (j < i)) continue;
|
||||
if (HalfNeigh && !Newton && (j < i)) continue;
|
||||
if (!HalfNeigh && j==i) continue;
|
||||
if (Tri) {
|
||||
if (x(j,2) < ztmp) continue;
|
||||
if (x(j,2) == ztmp) {
|
||||
if (x(j,1) < ytmp) continue;
|
||||
if (x(j,1) == ytmp) {
|
||||
if (x(j,0) < xtmp) continue;
|
||||
if (x(j,0) == xtmp && j <= i) continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (exclude && exclusion(i,j,itype,jtype)) continue;
|
||||
|
||||
const X_FLOAT delx = xtmp - other_x[m];
|
||||
const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin];
|
||||
const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin];
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
|
||||
if (rsq <= cutneighsq(itype,jtype)) {
|
||||
if (molecular != Atom::ATOMIC) {
|
||||
int which = 0;
|
||||
if (!moltemplate)
|
||||
which = NeighborKokkosExecute<DeviceType>::find_special(i,j);
|
||||
/* else if (imol >= 0) */
|
||||
/* which = find_special(onemols[imol]->special[iatom], */
|
||||
/* onemols[imol]->nspecial[iatom], */
|
||||
/* tag[j]-tagprev); */
|
||||
/* else which = 0; */
|
||||
if (which == 0) {
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
} else if (minimum_image_check(delx,dely,delz)) {
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
else if (which > 0) {
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
else n++;
|
||||
}
|
||||
} else {
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
dev.team_barrier();
|
||||
}
|
||||
|
||||
if (i >= 0 && i < nlocal) {
|
||||
neigh_list.d_numneigh(i) = n;
|
||||
neigh_list.d_ilist(i) = i;
|
||||
}
|
||||
|
||||
if (n > neigh_list.maxneighs) {
|
||||
resize() = 1;
|
||||
|
||||
if (n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -779,7 +793,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
|
||||
template<class DeviceType> template<int HalfNeigh>
|
||||
KOKKOS_FUNCTION
|
||||
void NeighborKokkosExecute<DeviceType>::
|
||||
build_Item_Ghost(const int &i) const
|
||||
build_ItemGhost(const int &i) const
|
||||
{
|
||||
/* if necessary, goto next page and add pages */
|
||||
int n = 0;
|
||||
@ -833,24 +847,23 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
/* tag[j]-tagprev); */
|
||||
/* else which = 0; */
|
||||
if (which == 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
} else if (minimum_image_check(delx,dely,delz)) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
else if (which > 0) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
else n++;
|
||||
}
|
||||
} else {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
int binxyz[3];
|
||||
const int ibin = coord2bin(xtmp, ytmp, ztmp, binxyz);
|
||||
@ -880,7 +893,7 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
|
||||
if (rsq <= cutneighsq(itype,jtype)) {
|
||||
if (n<neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
}
|
||||
@ -899,6 +912,208 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
template<class DeviceType> template<int HalfNeigh>
|
||||
LAMMPS_DEVICE_FUNCTION inline
|
||||
void NeighborKokkosExecute<DeviceType>::build_ItemGhostGPU(typename Kokkos::TeamPolicy<DeviceType>::member_type dev,
|
||||
size_t sharedsize) const
|
||||
{
|
||||
auto* sharedmem = static_cast<X_FLOAT *>(dev.team_shmem().get_shmem(sharedsize));
|
||||
// loop over atoms in i's bin
|
||||
|
||||
const int atoms_per_bin = c_bins.extent(1);
|
||||
const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin <1?1:dev.team_size()/atoms_per_bin;
|
||||
const int TEAMS_PER_BIN = atoms_per_bin/dev.team_size()<1?1:atoms_per_bin/dev.team_size();
|
||||
const int MY_BIN = dev.team_rank()/atoms_per_bin;
|
||||
|
||||
const int ibin = dev.league_rank()*BINS_PER_TEAM+MY_BIN;
|
||||
|
||||
if (ibin >= mbins) return;
|
||||
|
||||
X_FLOAT* other_x = sharedmem + 5*atoms_per_bin*MY_BIN;
|
||||
int* other_id = (int*) &other_x[4 * atoms_per_bin];
|
||||
|
||||
int bincount_current = c_bincount[ibin];
|
||||
|
||||
for (int kk = 0; kk < TEAMS_PER_BIN; kk++) {
|
||||
const int MY_II = dev.team_rank()%atoms_per_bin+kk*dev.team_size();
|
||||
const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1;
|
||||
|
||||
int n = 0;
|
||||
|
||||
X_FLOAT xtmp;
|
||||
X_FLOAT ytmp;
|
||||
X_FLOAT ztmp;
|
||||
int itype;
|
||||
const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i >= 0 && i < nall)?i:0);
|
||||
|
||||
if (i >= 0) {
|
||||
xtmp = x(i, 0);
|
||||
ytmp = x(i, 1);
|
||||
ztmp = x(i, 2);
|
||||
itype = type(i);
|
||||
other_x[MY_II] = xtmp;
|
||||
other_x[MY_II + atoms_per_bin] = ytmp;
|
||||
other_x[MY_II + 2 * atoms_per_bin] = ztmp;
|
||||
other_x[MY_II + 3 * atoms_per_bin] = itype;
|
||||
}
|
||||
other_id[MY_II] = i;
|
||||
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
|
||||
int test = (__syncthreads_count(i >= 0 && i < nall) == 0);
|
||||
if (test) return;
|
||||
#elif defined(KOKKOS_ENABLE_SYCL)
|
||||
int not_done = (i >= 0 && i < nall);
|
||||
dev.team_reduce(Kokkos::Max<int>(not_done));
|
||||
if (not_done == 0) return;
|
||||
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
|
||||
dev.team_barrier();
|
||||
#endif
|
||||
|
||||
int which = 0;
|
||||
int moltemplate;
|
||||
if (molecular == Atom::TEMPLATE) moltemplate = 1;
|
||||
else moltemplate = 0;
|
||||
|
||||
const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
|
||||
= d_stencil;
|
||||
const typename ArrayTypes<DeviceType>::t_int_1d_3_const_um stencilxyz
|
||||
= d_stencilxyz;
|
||||
|
||||
// loop over all atoms in surrounding bins in stencil including self
|
||||
// when i is a ghost atom, must check if stencil bin is out of bounds
|
||||
// skip i = j
|
||||
// no molecular test when i = ghost atom
|
||||
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
const int jbin = ibin + stencil[k];
|
||||
bincount_current = c_bincount[jbin];
|
||||
int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1;
|
||||
|
||||
if (j >= 0) {
|
||||
other_x[MY_II] = x(j, 0);
|
||||
other_x[MY_II + atoms_per_bin] = x(j, 1);
|
||||
other_x[MY_II + 2 * atoms_per_bin] = x(j, 2);
|
||||
other_x[MY_II + 3 * atoms_per_bin] = type(j);
|
||||
}
|
||||
|
||||
other_id[MY_II] = j;
|
||||
|
||||
dev.team_barrier();
|
||||
|
||||
if (i >= 0 && i < nlocal) {
|
||||
#pragma unroll 8
|
||||
for (int m = 0; m < bincount_current; m++) {
|
||||
const int j = other_id[m];
|
||||
|
||||
if (HalfNeigh && j <= i) continue;
|
||||
else if (j == i) continue;
|
||||
|
||||
const int jtype = other_x[m + 3 * atoms_per_bin];
|
||||
if (exclude && exclusion(i,j,itype,jtype)) continue;
|
||||
|
||||
const X_FLOAT delx = xtmp - other_x[m];
|
||||
const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin];
|
||||
const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin];
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
|
||||
if (rsq <= cutneighsq(itype,jtype)) {
|
||||
if (molecular != Atom::ATOMIC) {
|
||||
if (!moltemplate)
|
||||
which = NeighborKokkosExecute<DeviceType>::find_special(i,j);
|
||||
/* else if (imol >= 0) */
|
||||
/* which = find_special(onemols[imol]->special[iatom], */
|
||||
/* onemols[imol]->nspecial[iatom], */
|
||||
/* tag[j]-tagprev); */
|
||||
/* else which = 0; */
|
||||
if (which == 0) {
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
} else if (minimum_image_check(delx,dely,delz)) {
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
else if (which > 0) {
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
|
||||
else n++;
|
||||
}
|
||||
} else {
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dev.team_barrier();
|
||||
}
|
||||
|
||||
int binxyz[3];
|
||||
const int ibin = coord2bin(xtmp, ytmp, ztmp, binxyz);
|
||||
const int xbin = binxyz[0];
|
||||
const int ybin = binxyz[1];
|
||||
const int zbin = binxyz[2];
|
||||
for (int k = 0; k < nstencil; k++) {
|
||||
const int xbin2 = xbin + stencilxyz(k,0);
|
||||
const int ybin2 = ybin + stencilxyz(k,1);
|
||||
const int zbin2 = zbin + stencilxyz(k,2);
|
||||
if (xbin2 < 0 || xbin2 >= mbinx ||
|
||||
ybin2 < 0 || ybin2 >= mbiny ||
|
||||
zbin2 < 0 || zbin2 >= mbinz) continue;
|
||||
const int jbin = ibin + stencil[k];
|
||||
int bincount_current = c_bincount[jbin];
|
||||
int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1;
|
||||
|
||||
if (j >= 0) {
|
||||
other_x[MY_II] = x(j, 0);
|
||||
other_x[MY_II + atoms_per_bin] = x(j, 1);
|
||||
other_x[MY_II + 2 * atoms_per_bin] = x(j, 2);
|
||||
other_x[MY_II + 3 * atoms_per_bin] = type(j);
|
||||
}
|
||||
|
||||
other_id[MY_II] = j;
|
||||
|
||||
dev.team_barrier();
|
||||
|
||||
if (i >= nlocal && i < nall) {
|
||||
#pragma unroll 8
|
||||
for (int m = 0; m < bincount_current; m++) {
|
||||
const int j = other_id[m];
|
||||
|
||||
if (HalfNeigh && j <= i) continue;
|
||||
else if (j == i) continue;
|
||||
|
||||
const int jtype = other_x[m + 3 * atoms_per_bin];
|
||||
if (exclude && exclusion(i,j,itype,jtype)) continue;
|
||||
|
||||
const X_FLOAT delx = xtmp - other_x[m];
|
||||
const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin];
|
||||
const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin];
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
|
||||
if (rsq <= cutneighsq(itype,jtype)) {
|
||||
if (n < neigh_list.maxneighs) neighbors_i(n++) = j;
|
||||
else n++;
|
||||
}
|
||||
}
|
||||
}
|
||||
dev.team_barrier();
|
||||
}
|
||||
|
||||
if (i >= 0 && i < nall) {
|
||||
neigh_list.d_numneigh(i) = n;
|
||||
neigh_list.d_ilist(i) = i;
|
||||
}
|
||||
|
||||
if (n > neigh_list.maxneighs) {
|
||||
resize() = 1;
|
||||
|
||||
if (n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType> template<int HalfNeigh,int Newton,int Tri>
|
||||
KOKKOS_FUNCTION
|
||||
void NeighborKokkosExecute<DeviceType>::
|
||||
@ -940,12 +1155,12 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
const X_FLOAT delx = xtmp - x(j, 0);
|
||||
const X_FLOAT dely = ytmp - x(j, 1);
|
||||
const X_FLOAT delz = ztmp - x(j, 2);
|
||||
const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
const X_FLOAT radsum = radi + radius(j);
|
||||
const X_FLOAT cutsq = (radsum + skin) * (radsum + skin);
|
||||
|
||||
if (rsq <= cutsq) {
|
||||
if (n<neigh_list.maxneighs) {
|
||||
if (n < neigh_list.maxneighs) {
|
||||
if (neigh_list.history && rsq < radsum*radsum) neighbors_i(n++) = j ^ mask_history;
|
||||
else neighbors_i(n++) = j;
|
||||
}
|
||||
@ -981,12 +1196,12 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
const X_FLOAT delx = xtmp - x(j, 0);
|
||||
const X_FLOAT dely = ytmp - x(j, 1);
|
||||
const X_FLOAT delz = ztmp - x(j, 2);
|
||||
const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
const X_FLOAT radsum = radi + radius(j);
|
||||
const X_FLOAT cutsq = (radsum + skin) * (radsum + skin);
|
||||
|
||||
if (rsq <= cutsq) {
|
||||
if (n<neigh_list.maxneighs) {
|
||||
if (n < neigh_list.maxneighs) {
|
||||
if (neigh_list.history && rsq < radsum*radsum) neighbors_i(n++) = j ^ mask_history;
|
||||
else neighbors_i(n++) = j;
|
||||
}
|
||||
@ -1018,16 +1233,15 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
|
||||
/* loop over atoms in i's bin,
|
||||
*/
|
||||
const int atoms_per_bin = c_bins.extent(1);
|
||||
const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin<1?1:dev.team_size()/atoms_per_bin;
|
||||
const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin <1?1:dev.team_size()/atoms_per_bin;
|
||||
const int TEAMS_PER_BIN = atoms_per_bin/dev.team_size()<1?1:atoms_per_bin/dev.team_size();
|
||||
const int MY_BIN = dev.team_rank()/atoms_per_bin;
|
||||
|
||||
const int ibin = dev.league_rank()*BINS_PER_TEAM+MY_BIN;
|
||||
|
||||
if (ibin >= mbins) return;
|
||||
X_FLOAT* other_x = sharedmem;
|
||||
other_x = other_x + 6*atoms_per_bin*MY_BIN;
|
||||
|
||||
X_FLOAT* other_x = sharedmem + 6*atoms_per_bin*MY_BIN;
|
||||
int* other_id = (int*) &other_x[5 * atoms_per_bin];
|
||||
|
||||
int bincount_current = c_bincount[ibin];
|
||||
@ -1044,7 +1258,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
|
||||
X_FLOAT ztmp;
|
||||
X_FLOAT radi;
|
||||
int itype;
|
||||
const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i>=0&&i<nlocal)?i:0);
|
||||
const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i >= 0 && i < nlocal)?i:0);
|
||||
const int mask_history = 3 << SBBITS;
|
||||
|
||||
if (i >= 0) {
|
||||
@ -1061,12 +1275,12 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
|
||||
}
|
||||
other_id[MY_II] = i;
|
||||
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
|
||||
int test = (__syncthreads_count(i >= 0 && i <= nlocal) == 0);
|
||||
int test = (__syncthreads_count(i >= 0 && i < nlocal) == 0);
|
||||
if (test) return;
|
||||
#elif defined(KOKKOS_ENABLE_SYCL)
|
||||
int not_done = (i >= 0 && i <= nlocal);
|
||||
int not_done = (i >= 0 && i < nlocal);
|
||||
dev.team_reduce(Kokkos::Max<int>(not_done));
|
||||
if(not_done == 0) return;
|
||||
if (not_done == 0) return;
|
||||
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
|
||||
dev.team_barrier();
|
||||
#endif
|
||||
@ -1099,12 +1313,12 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
|
||||
const X_FLOAT delx = xtmp - other_x[m];
|
||||
const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin];
|
||||
const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin];
|
||||
const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
const X_FLOAT radsum = radi + other_x[m + 4 * atoms_per_bin];
|
||||
const X_FLOAT cutsq = (radsum + skin) * (radsum + skin);
|
||||
|
||||
if (rsq <= cutsq) {
|
||||
if (n<neigh_list.maxneighs) {
|
||||
if (n < neigh_list.maxneighs) {
|
||||
if (neigh_list.history && rsq < radsum*radsum) neighbors_i(n++) = j ^ mask_history;
|
||||
else neighbors_i(n++) = j;
|
||||
}
|
||||
@ -1160,12 +1374,12 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
|
||||
const X_FLOAT delx = xtmp - other_x[m];
|
||||
const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin];
|
||||
const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin];
|
||||
const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
|
||||
const X_FLOAT radsum = radi + other_x[m + 4 * atoms_per_bin];
|
||||
const X_FLOAT cutsq = (radsum + skin) * (radsum + skin);
|
||||
|
||||
if (rsq <= cutsq) {
|
||||
if (n<neigh_list.maxneighs) {
|
||||
if (n < neigh_list.maxneighs) {
|
||||
if (neigh_list.history && rsq < radsum*radsum) neighbors_i(n++) = j ^ mask_history;
|
||||
else neighbors_i(n++) = j;
|
||||
}
|
||||
|
||||
@ -208,7 +208,7 @@ class NeighborKokkosExecute
|
||||
const X_FLOAT bininvx,bininvy,bininvz;
|
||||
X_FLOAT bboxhi[3],bboxlo[3];
|
||||
|
||||
const int nlocal;
|
||||
const int nlocal,nall;
|
||||
|
||||
typename AT::t_int_scalar resize;
|
||||
typename AT::t_int_scalar new_maxneighs;
|
||||
@ -230,7 +230,7 @@ class NeighborKokkosExecute
|
||||
const int _mbins,const int _nstencil,
|
||||
const typename AT::t_int_1d &_d_stencil,
|
||||
const typename AT::t_int_1d_3 &_d_stencilxyz,
|
||||
const int _nlocal,
|
||||
const int _nlocal,const int _nall,
|
||||
const typename AT::t_x_array_randomread &_x,
|
||||
const typename AT::t_float_1d &_radius,
|
||||
const typename AT::t_int_1d_const &_type,
|
||||
@ -280,7 +280,7 @@ class NeighborKokkosExecute
|
||||
mbinx(_mbinx),mbiny(_mbiny),mbinz(_mbinz),
|
||||
mbinxlo(_mbinxlo),mbinylo(_mbinylo),mbinzlo(_mbinzlo),
|
||||
bininvx(_bininvx),bininvy(_bininvy),bininvz(_bininvz),
|
||||
nlocal(_nlocal),
|
||||
nlocal(_nlocal),nall(_nall),
|
||||
xperiodic(_xperiodic),yperiodic(_yperiodic),zperiodic(_zperiodic),
|
||||
xprd_half(_xprd_half),yprd_half(_yprd_half),zprd_half(_zprd_half),
|
||||
skin(_skin),resize(_resize),h_resize(_h_resize),
|
||||
@ -304,7 +304,7 @@ class NeighborKokkosExecute
|
||||
|
||||
template<int HalfNeigh>
|
||||
KOKKOS_FUNCTION
|
||||
void build_Item_Ghost(const int &i) const;
|
||||
void build_ItemGhost(const int &i) const;
|
||||
|
||||
template<int HalfNeigh, int Newton, int Tri>
|
||||
KOKKOS_FUNCTION
|
||||
@ -316,6 +316,11 @@ class NeighborKokkosExecute
|
||||
void build_ItemGPU(typename Kokkos::TeamPolicy<DeviceType>::member_type dev,
|
||||
size_t sharedsize) const;
|
||||
|
||||
template<int HalfNeigh>
|
||||
LAMMPS_DEVICE_FUNCTION inline
|
||||
void build_ItemGhostGPU(typename Kokkos::TeamPolicy<DeviceType>::member_type dev,
|
||||
size_t sharedsize) const;
|
||||
|
||||
template<int HalfNeigh, int Newton, int Tri>
|
||||
LAMMPS_DEVICE_FUNCTION inline
|
||||
void build_ItemSizeGPU(typename Kokkos::TeamPolicy<DeviceType>::member_type dev,
|
||||
@ -422,13 +427,24 @@ struct NPairKokkosBuildFunctorGhost {
|
||||
typedef DeviceType device_type;
|
||||
|
||||
const NeighborKokkosExecute<DeviceType> c;
|
||||
size_t sharedsize;
|
||||
|
||||
NPairKokkosBuildFunctorGhost(const NeighborKokkosExecute<DeviceType> &_c):c(_c) {}
|
||||
NPairKokkosBuildFunctorGhost(const NeighborKokkosExecute<DeviceType> &_c,
|
||||
size_t _sharedsize):c(_c),
|
||||
sharedsize(_sharedsize) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int & i) const {
|
||||
c.template build_Item_Ghost<HALF_NEIGH>(i);
|
||||
c.template build_ItemGhost<HALF_NEIGH>(i);
|
||||
}
|
||||
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
LAMMPS_DEVICE_FUNCTION inline
|
||||
void operator() (typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const {
|
||||
c.template build_ItemGhostGPU<HALF_NEIGH>(dev, sharedsize);
|
||||
}
|
||||
size_t team_shmem_size(const int team_size) const { (void) team_size; return sharedsize; }
|
||||
#endif
|
||||
};
|
||||
|
||||
template <class DeviceType, int HALF_NEIGH, int GHOST_NEWTON, int TRI>
|
||||
|
||||
Reference in New Issue
Block a user