Merge branch 'master' into collected-small-changes
This commit is contained in:
@ -137,24 +137,24 @@ class AtomVecKokkos : public AtomVec {
|
||||
size_t buffer_size;
|
||||
void* buffer;
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
template<class ViewType>
|
||||
Kokkos::View<typename ViewType::data_type,
|
||||
typename ViewType::array_layout,
|
||||
Kokkos::CudaHostPinnedSpace,
|
||||
LMPPinnedHostType,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged> >
|
||||
create_async_copy(const ViewType& src) {
|
||||
typedef Kokkos::View<typename ViewType::data_type,
|
||||
typename ViewType::array_layout,
|
||||
typename std::conditional<
|
||||
std::is_same<typename ViewType::execution_space,LMPDeviceType>::value,
|
||||
Kokkos::CudaHostPinnedSpace,typename ViewType::memory_space>::type,
|
||||
LMPPinnedHostType,typename ViewType::memory_space>::type,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged> > mirror_type;
|
||||
if (buffer_size == 0) {
|
||||
buffer = Kokkos::kokkos_malloc<Kokkos::CudaHostPinnedSpace>(src.span());
|
||||
buffer = Kokkos::kokkos_malloc<LMPPinnedHostType>(src.span());
|
||||
buffer_size = src.span();
|
||||
} else if (buffer_size < src.span()) {
|
||||
buffer = Kokkos::kokkos_realloc<Kokkos::CudaHostPinnedSpace>(buffer,src.span());
|
||||
buffer = Kokkos::kokkos_realloc<LMPPinnedHostType>(buffer,src.span());
|
||||
buffer_size = src.span();
|
||||
}
|
||||
return mirror_type(buffer, src.d_view.layout());
|
||||
@ -166,13 +166,13 @@ class AtomVecKokkos : public AtomVec {
|
||||
typename ViewType::array_layout,
|
||||
typename std::conditional<
|
||||
std::is_same<typename ViewType::execution_space,LMPDeviceType>::value,
|
||||
Kokkos::CudaHostPinnedSpace,typename ViewType::memory_space>::type,
|
||||
LMPPinnedHostType,typename ViewType::memory_space>::type,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged> > mirror_type;
|
||||
if (buffer_size == 0) {
|
||||
buffer = Kokkos::kokkos_malloc<Kokkos::CudaHostPinnedSpace>(src.span()*sizeof(typename ViewType::value_type));
|
||||
buffer = Kokkos::kokkos_malloc<LMPPinnedHostType>(src.span()*sizeof(typename ViewType::value_type));
|
||||
buffer_size = src.span();
|
||||
} else if (buffer_size < src.span()) {
|
||||
buffer = Kokkos::kokkos_realloc<Kokkos::CudaHostPinnedSpace>(buffer,src.span()*sizeof(typename ViewType::value_type));
|
||||
buffer = Kokkos::kokkos_realloc<LMPPinnedHostType>(buffer,src.span()*sizeof(typename ViewType::value_type));
|
||||
buffer_size = src.span();
|
||||
}
|
||||
mirror_type tmp_view((typename ViewType::value_type*)buffer, src.d_view.layout());
|
||||
|
||||
@ -419,7 +419,7 @@ void CommKokkos::forward_comm_pair_device(Pair *pair)
|
||||
if (sendproc[iswap] != me) {
|
||||
double* buf_send_pair;
|
||||
double* buf_recv_pair;
|
||||
if (lmp->kokkos->cuda_aware_flag) {
|
||||
if (lmp->kokkos->gpu_aware_flag) {
|
||||
buf_send_pair = k_buf_send_pair.view<DeviceType>().data();
|
||||
buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data();
|
||||
} else {
|
||||
@ -437,7 +437,7 @@ void CommKokkos::forward_comm_pair_device(Pair *pair)
|
||||
MPI_Send(buf_send_pair,n,MPI_DOUBLE,sendproc[iswap],0,world);
|
||||
if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
|
||||
|
||||
if (!lmp->kokkos->cuda_aware_flag) {
|
||||
if (!lmp->kokkos->gpu_aware_flag) {
|
||||
k_buf_recv_pair.modify<LMPHostType>();
|
||||
k_buf_recv_pair.sync<DeviceType>();
|
||||
}
|
||||
|
||||
@ -678,7 +678,7 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int which,
|
||||
KokkosBaseFFT* kspaceKKBase = dynamic_cast<KokkosBaseFFT*>(kspace);
|
||||
FFT_SCALAR* buf1;
|
||||
FFT_SCALAR* buf2;
|
||||
if (lmp->kokkos->cuda_aware_flag) {
|
||||
if (lmp->kokkos->gpu_aware_flag) {
|
||||
buf1 = k_buf1.view<DeviceType>().data();
|
||||
buf2 = k_buf2.view<DeviceType>().data();
|
||||
} else {
|
||||
@ -695,7 +695,7 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int which,
|
||||
|
||||
if (swap[m].sendproc != me) {
|
||||
|
||||
if (!lmp->kokkos->cuda_aware_flag) {
|
||||
if (!lmp->kokkos->gpu_aware_flag) {
|
||||
k_buf1.modify<DeviceType>();
|
||||
k_buf1.sync<LMPHostType>();
|
||||
}
|
||||
@ -706,7 +706,7 @@ forward_comm_kspace_regular(KSpace *kspace, int nper, int which,
|
||||
swap[m].sendproc,0,gridcomm);
|
||||
if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE);
|
||||
|
||||
if (!lmp->kokkos->cuda_aware_flag) {
|
||||
if (!lmp->kokkos->gpu_aware_flag) {
|
||||
k_buf2.modify<LMPHostType>();
|
||||
k_buf2.sync<DeviceType>();
|
||||
}
|
||||
@ -731,7 +731,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int which,
|
||||
KokkosBaseFFT* kspaceKKBase = dynamic_cast<KokkosBaseFFT*>(kspace);
|
||||
FFT_SCALAR* buf1;
|
||||
FFT_SCALAR* buf2;
|
||||
if (lmp->kokkos->cuda_aware_flag) {
|
||||
if (lmp->kokkos->gpu_aware_flag) {
|
||||
buf1 = k_buf1.view<DeviceType>().data();
|
||||
buf2 = k_buf2.view<DeviceType>().data();
|
||||
} else {
|
||||
@ -753,7 +753,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int which,
|
||||
kspaceKKBase->pack_forward_grid_kokkos(which,k_buf1,send[m].npack,k_send_packlist,m);
|
||||
DeviceType().fence();
|
||||
|
||||
if (!lmp->kokkos->cuda_aware_flag) {
|
||||
if (!lmp->kokkos->gpu_aware_flag) {
|
||||
k_buf1.modify<DeviceType>();
|
||||
k_buf1.sync<LMPHostType>();
|
||||
}
|
||||
@ -773,7 +773,7 @@ forward_comm_kspace_tiled(KSpace *kspace, int nper, int which,
|
||||
for (i = 0; i < nrecv; i++) {
|
||||
MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE);
|
||||
|
||||
if (!lmp->kokkos->cuda_aware_flag) {
|
||||
if (!lmp->kokkos->gpu_aware_flag) {
|
||||
k_buf2.modify<LMPHostType>();
|
||||
k_buf2.sync<DeviceType>();
|
||||
}
|
||||
@ -814,7 +814,7 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int which,
|
||||
KokkosBaseFFT* kspaceKKBase = dynamic_cast<KokkosBaseFFT*>(kspace);
|
||||
FFT_SCALAR* buf1;
|
||||
FFT_SCALAR* buf2;
|
||||
if (lmp->kokkos->cuda_aware_flag) {
|
||||
if (lmp->kokkos->gpu_aware_flag) {
|
||||
buf1 = k_buf1.view<DeviceType>().data();
|
||||
buf2 = k_buf2.view<DeviceType>().data();
|
||||
} else {
|
||||
@ -831,7 +831,7 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int which,
|
||||
|
||||
if (swap[m].recvproc != me) {
|
||||
|
||||
if (!lmp->kokkos->cuda_aware_flag) {
|
||||
if (!lmp->kokkos->gpu_aware_flag) {
|
||||
k_buf1.modify<DeviceType>();
|
||||
k_buf1.sync<LMPHostType>();
|
||||
}
|
||||
@ -843,7 +843,7 @@ reverse_comm_kspace_regular(KSpace *kspace, int nper, int which,
|
||||
if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE);
|
||||
|
||||
|
||||
if (!lmp->kokkos->cuda_aware_flag) {
|
||||
if (!lmp->kokkos->gpu_aware_flag) {
|
||||
k_buf2.modify<LMPHostType>();
|
||||
k_buf2.sync<DeviceType>();
|
||||
}
|
||||
@ -869,7 +869,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int which,
|
||||
|
||||
FFT_SCALAR* buf1;
|
||||
FFT_SCALAR* buf2;
|
||||
if (lmp->kokkos->cuda_aware_flag) {
|
||||
if (lmp->kokkos->gpu_aware_flag) {
|
||||
buf1 = k_buf1.view<DeviceType>().data();
|
||||
buf2 = k_buf2.view<DeviceType>().data();
|
||||
} else {
|
||||
@ -891,7 +891,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int which,
|
||||
kspaceKKBase->pack_reverse_grid_kokkos(which,k_buf1,recv[m].nunpack,k_recv_unpacklist,m);
|
||||
DeviceType().fence();
|
||||
|
||||
if (!lmp->kokkos->cuda_aware_flag) {
|
||||
if (!lmp->kokkos->gpu_aware_flag) {
|
||||
k_buf1.modify<DeviceType>();
|
||||
k_buf1.sync<LMPHostType>();
|
||||
}
|
||||
@ -911,7 +911,7 @@ reverse_comm_kspace_tiled(KSpace *kspace, int nper, int which,
|
||||
for (i = 0; i < nsend; i++) {
|
||||
MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE);
|
||||
|
||||
if (!lmp->kokkos->cuda_aware_flag) {
|
||||
if (!lmp->kokkos->gpu_aware_flag) {
|
||||
k_buf2.modify<LMPHostType>();
|
||||
k_buf2.sync<DeviceType>();
|
||||
}
|
||||
|
||||
@ -24,15 +24,20 @@
|
||||
#include <csignal>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
|
||||
// for detecting CUDA-aware MPI support:
|
||||
// the variable int have_cuda_aware
|
||||
// - is 1 if CUDA-aware MPI support is available
|
||||
// - is 0 if CUDA-aware MPI support is unavailable
|
||||
// - is -1 if CUDA-aware MPI support is unknown
|
||||
// for detecting GPU-aware MPI support:
|
||||
// the variable int have_gpu_aware
|
||||
// - is 1 if GPU-aware MPI support is available
|
||||
// - is 0 if GPU-aware MPI support is unavailable
|
||||
// - is -1 if GPU-aware MPI support is unknown
|
||||
|
||||
#define CUDA_AWARE_UNKNOWN static int have_cuda_aware = -1;
|
||||
#define GPU_AWARE_UNKNOWN static int have_gpu_aware = -1;
|
||||
|
||||
// TODO HIP: implement HIP-aware MPI support (UCX) detection
|
||||
#if defined(KOKKOS_ENABLE_HIP)
|
||||
GPU_AWARE_UNKNOWN
|
||||
#elif defined(KOKKOS_ENABLE_CUDA)
|
||||
|
||||
// OpenMPI supports detecting CUDA-aware MPI as of version 2.0.0
|
||||
|
||||
@ -42,23 +47,25 @@
|
||||
#include <mpi-ext.h>
|
||||
|
||||
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
|
||||
static int have_cuda_aware = 1;
|
||||
static int have_gpu_aware = 1;
|
||||
#elif defined(MPIX_CUDA_AWARE_SUPPORT) && !MPIX_CUDA_AWARE_SUPPORT
|
||||
static int have_cuda_aware = 0;
|
||||
static int have_gpu_aware = 0;
|
||||
#else
|
||||
CUDA_AWARE_UNKNOWN
|
||||
GPU_AWARE_UNKNOWN
|
||||
#endif // defined(MPIX_CUDA_AWARE_SUPPORT)
|
||||
|
||||
#else // old OpenMPI
|
||||
CUDA_AWARE_UNKNOWN
|
||||
GPU_AWARE_UNKNOWN
|
||||
#endif // (OMPI_MAJOR_VERSION >=2)
|
||||
|
||||
#else // unknown MPI library
|
||||
CUDA_AWARE_UNKNOWN
|
||||
GPU_AWARE_UNKNOWN
|
||||
#endif // OPEN_MPI
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
|
||||
#endif // LMP_ENABLE_DEVICE
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
@ -185,9 +192,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
||||
|
||||
binsize = 0.0;
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
cuda_aware_flag = 1;
|
||||
// TODO HIP: implement HIP-aware MPI testing
|
||||
gpu_aware_flag = 1;
|
||||
#else
|
||||
cuda_aware_flag = 0;
|
||||
gpu_aware_flag = 0;
|
||||
#endif
|
||||
neigh_thread = 0;
|
||||
neigh_thread_set = 0;
|
||||
@ -211,7 +219,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
||||
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
|
||||
// check and warn about CUDA-aware MPI availability when using multiple MPI tasks
|
||||
// change default only if we can safely detect that CUDA-aware MPI is not available
|
||||
@ -227,42 +235,42 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
||||
char mpi_version[MPI_MAX_LIBRARY_VERSION_STRING];
|
||||
MPI_Get_library_version(mpi_version, &len);
|
||||
if (strstr(&mpi_version[0], "Spectrum") != nullptr) {
|
||||
cuda_aware_flag = 0;
|
||||
gpu_aware_flag = 0;
|
||||
char* str;
|
||||
if (str = getenv("OMPI_MCA_pml_pami_enable_cuda"))
|
||||
if ((str = getenv("OMPI_MCA_pml_pami_enable_cuda")))
|
||||
if((strcmp(str,"1") == 0)) {
|
||||
have_cuda_aware = 1;
|
||||
cuda_aware_flag = 1;
|
||||
have_gpu_aware = 1;
|
||||
gpu_aware_flag = 1;
|
||||
}
|
||||
|
||||
if (!cuda_aware_flag)
|
||||
if (!gpu_aware_flag)
|
||||
if (me == 0)
|
||||
error->warning(FLERR,"The Spectrum MPI '-gpu' flag is not set. Disabling CUDA-aware MPI");
|
||||
}
|
||||
#endif
|
||||
|
||||
if (cuda_aware_flag == 1 && have_cuda_aware == 0) {
|
||||
if (gpu_aware_flag == 1 && have_gpu_aware == 0) {
|
||||
if (me == 0)
|
||||
error->warning(FLERR,"Turning off CUDA-aware MPI since it is not detected, "
|
||||
"use '-pk kokkos cuda/aware on' to override");
|
||||
cuda_aware_flag = 0;
|
||||
} else if (have_cuda_aware == -1) { // maybe we are dealing with MPICH, MVAPICH2 or some derivative?
|
||||
gpu_aware_flag = 0;
|
||||
} else if (have_gpu_aware == -1) { // maybe we are dealing with MPICH, MVAPICH2 or some derivative?
|
||||
// MVAPICH2
|
||||
#if defined(MPICH) && defined(MVAPICH2_VERSION)
|
||||
char* str;
|
||||
cuda_aware_flag = 0;
|
||||
if ((str = getenv("MV2_USE_CUDA")))
|
||||
gpu_aware_flag = 0;
|
||||
if ((str = getenv("MV2_ENABLE_CUDA")))
|
||||
if ((strcmp(str,"1") == 0))
|
||||
cuda_aware_flag = 1;
|
||||
gpu_aware_flag = 1;
|
||||
|
||||
if (!cuda_aware_flag)
|
||||
if (!gpu_aware_flag)
|
||||
if (me == 0)
|
||||
error->warning(FLERR,"MVAPICH2 'MV2_USE_CUDA' environment variable is not set. Disabling CUDA-aware MPI");
|
||||
// pure MPICH or some unsupported MPICH derivative
|
||||
#elif defined(MPICH) && !defined(MVAPICH2_VERSION)
|
||||
if (me == 0)
|
||||
error->warning(FLERR,"Detected MPICH. Disabling CUDA-aware MPI");
|
||||
cuda_aware_flag = 0;
|
||||
gpu_aware_flag = 0;
|
||||
#else
|
||||
if (me == 0)
|
||||
error->warning(FLERR,"Kokkos with CUDA assumes CUDA-aware MPI is available,"
|
||||
@ -270,9 +278,9 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
||||
" '-pk kokkos cuda/aware off' if getting segmentation faults");
|
||||
|
||||
#endif
|
||||
} // if (-1 == have_cuda_aware)
|
||||
} // if (-1 == have_gpu_aware)
|
||||
} // nmpi > 0
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
#endif // LMP_ENABLE_DEVICE
|
||||
|
||||
#ifdef KILL_KOKKOS_ON_SIGSEGV
|
||||
signal(SIGSEGV, my_signal_handler);
|
||||
@ -381,8 +389,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
|
||||
iarg += 2;
|
||||
} else if (strcmp(arg[iarg],"cuda/aware") == 0) {
|
||||
if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
|
||||
if (strcmp(arg[iarg+1],"off") == 0) cuda_aware_flag = 0;
|
||||
else if (strcmp(arg[iarg+1],"on") == 0) cuda_aware_flag = 1;
|
||||
if (strcmp(arg[iarg+1],"off") == 0) gpu_aware_flag = 0;
|
||||
else if (strcmp(arg[iarg+1],"on") == 0) gpu_aware_flag = 1;
|
||||
else error->all(FLERR,"Illegal package kokkos command");
|
||||
iarg += 2;
|
||||
} else if (strcmp(arg[iarg],"neigh/thread") == 0) {
|
||||
@ -395,14 +403,14 @@ void KokkosLMP::accelerator(int narg, char **arg)
|
||||
} else error->all(FLERR,"Illegal package kokkos command");
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
|
||||
int nmpi = 0;
|
||||
MPI_Comm_size(world,&nmpi);
|
||||
|
||||
// if "cuda/aware off" and "comm device", change to "comm host"
|
||||
|
||||
if (!cuda_aware_flag && nmpi > 1) {
|
||||
if (!gpu_aware_flag && nmpi > 1) {
|
||||
if (exchange_comm_classic == 0 && exchange_comm_on_host == 0) {
|
||||
exchange_comm_on_host = 1;
|
||||
exchange_comm_changed = 1;
|
||||
@ -419,7 +427,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
|
||||
|
||||
// if "cuda/aware on" and comm flags were changed previously, change them back
|
||||
|
||||
if (cuda_aware_flag) {
|
||||
if (gpu_aware_flag) {
|
||||
if (exchange_comm_changed) {
|
||||
exchange_comm_on_host = 0;
|
||||
exchange_comm_changed = 0;
|
||||
|
||||
@ -38,7 +38,7 @@ class KokkosLMP : protected Pointers {
|
||||
int nthreads,ngpus;
|
||||
int numa;
|
||||
int auto_sync;
|
||||
int cuda_aware_flag;
|
||||
int gpu_aware_flag;
|
||||
int neigh_thread;
|
||||
int neigh_thread_set;
|
||||
int newtonflag;
|
||||
|
||||
@ -211,6 +211,21 @@ struct ExecutionSpaceFromDevice<Kokkos::Experimental::HIP> {
|
||||
};
|
||||
#endif
|
||||
|
||||
// set host pinned space
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::CudaHostPinnedSpace LMPPinnedHostType;
|
||||
#elif defined(KOKKOS_ENABLE_HIP)
|
||||
typedef Kokkos::Experimental::HIPHostPinnedSpace LMPPinnedHostType;
|
||||
#endif
|
||||
|
||||
// create simple LMPDeviceSpace typedef for non HIP or CUDA specific
|
||||
// behaviour
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::Cuda LMPDeviceSpace;
|
||||
#elif defined(KOKKOS_ENABLE_HIP)
|
||||
typedef Kokkos::Experimental::HIP LMPDeviceSpace;
|
||||
#endif
|
||||
|
||||
|
||||
// Determine memory traits for force array
|
||||
// Do atomic trait when running HALFTHREAD neighbor list style
|
||||
@ -239,7 +254,7 @@ struct AtomicDup<HALFTHREAD,Kokkos::Cuda> {
|
||||
};
|
||||
#endif
|
||||
|
||||
#if defined(KOKKOS_ENABLE_HIP)
|
||||
#ifdef KOKKOS_ENABLE_HIP
|
||||
template<>
|
||||
struct AtomicDup<HALFTHREAD,Kokkos::Experimental::HIP> {
|
||||
using value = Kokkos::Experimental::ScatterAtomic;
|
||||
|
||||
@ -212,7 +212,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
|
||||
data.h_resize() = 0;
|
||||
|
||||
Kokkos::deep_copy(d_scalars, h_scalars);
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
#define BINS_PER_BLOCK 2
|
||||
const int factor = atoms_per_bin<64?2:1;
|
||||
#else
|
||||
@ -226,7 +226,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
|
||||
if (newton_pair) {
|
||||
if (SIZE) {
|
||||
NPairKokkosBuildFunctorSize<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
if (ExecutionSpaceFromDevice<DeviceType>::space == Device) {
|
||||
int team_size = atoms_per_bin*factor;
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType>(team_size,Kokkos::AUTO).team_size_max(f,Kokkos::ParallelForTag());
|
||||
@ -244,7 +244,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
|
||||
#endif
|
||||
} else {
|
||||
NPairKokkosBuildFunctor<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
if (ExecutionSpaceFromDevice<DeviceType>::space == Device) {
|
||||
int team_size = atoms_per_bin*factor;
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType>(team_size,Kokkos::AUTO).team_size_max(f,Kokkos::ParallelForTag());
|
||||
@ -264,7 +264,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
|
||||
} else {
|
||||
if (SIZE) {
|
||||
NPairKokkosBuildFunctorSize<DeviceType,HALF_NEIGH,0,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
if (ExecutionSpaceFromDevice<DeviceType>::space == Device) {
|
||||
int team_size = atoms_per_bin*factor;
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType>(team_size,Kokkos::AUTO).team_size_max(f,Kokkos::ParallelForTag());
|
||||
@ -282,7 +282,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
|
||||
#endif
|
||||
} else {
|
||||
NPairKokkosBuildFunctor<DeviceType,HALF_NEIGH,0,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
if (ExecutionSpaceFromDevice<DeviceType>::space == Device) {
|
||||
int team_size = atoms_per_bin*factor;
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType>(team_size,Kokkos::AUTO).team_size_max(f,Kokkos::ParallelForTag());
|
||||
@ -529,15 +529,33 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
extern __shared__ X_FLOAT sharedmem[];
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
#ifdef KOKKOS_ENABLE_HIP
|
||||
#include <hip/hip_version.h>
|
||||
#if HIP_VERSION_MAJOR < 3 || (HIP_VERSION_MAJOR == 3 && HIP_VERSION_MINOR < 7)
|
||||
// ROCm versions < 3.7 are missing __syncthreads_count, so we define a functional
|
||||
// but (probably) not performant workaround
|
||||
__device__ __forceinline__ int __syncthreads_count(int predicate) {
|
||||
__shared__ int test_block[1];
|
||||
if (!(threadIdx.x || threadIdx.y || threadIdx.z))
|
||||
test_block[0] = 0;
|
||||
__syncthreads();
|
||||
atomicAdd(test_block, predicate);
|
||||
__threadfence_block();
|
||||
return test_block[0];
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
template<class DeviceType> template<int HalfNeigh,int Newton,int Tri>
|
||||
__device__ inline
|
||||
void NeighborKokkosExecute<DeviceType>::build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const
|
||||
{
|
||||
#ifdef KOKKOS_ENABLE_HIP
|
||||
HIP_DYNAMIC_SHARED(X_FLOAT, sharedmem);
|
||||
#else
|
||||
extern __shared__ X_FLOAT sharedmem[];
|
||||
#endif
|
||||
/* loop over atoms in i's bin,
|
||||
*/
|
||||
const int atoms_per_bin = c_bins.extent(1);
|
||||
@ -971,11 +989,16 @@ void NeighborKokkosExecute<DeviceType>::
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
template<class DeviceType> template<int HalfNeigh,int Newton,int Tri>
|
||||
__device__ inline
|
||||
void NeighborKokkosExecute<DeviceType>::build_ItemSizeCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const
|
||||
{
|
||||
#ifdef KOKKOS_ENABLE_HIP
|
||||
HIP_DYNAMIC_SHARED(X_FLOAT, sharedmem);
|
||||
#else
|
||||
extern __shared__ X_FLOAT sharedmem[];
|
||||
#endif
|
||||
/* loop over atoms in i's bin,
|
||||
*/
|
||||
const int atoms_per_bin = c_bins.extent(1);
|
||||
|
||||
@ -310,7 +310,7 @@ class NeighborKokkosExecute
|
||||
KOKKOS_FUNCTION
|
||||
void build_ItemSize(const int &i) const;
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
template<int HalfNeigh, int Newton, int Tri>
|
||||
__device__ inline
|
||||
void build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
|
||||
@ -387,7 +387,7 @@ struct NPairKokkosBuildFunctor {
|
||||
void operator() (const int & i) const {
|
||||
c.template build_Item<HALF_NEIGH,GHOST_NEWTON,TRI>(i);
|
||||
}
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
__device__ inline
|
||||
|
||||
void operator() (typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const {
|
||||
@ -445,7 +445,7 @@ struct NPairKokkosBuildFunctorSize {
|
||||
c.template build_ItemSize<HALF_NEIGH,GHOST_NEWTON,TRI>(i);
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
__device__ inline
|
||||
void operator() (typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const {
|
||||
c.template build_ItemSizeCuda<HALF_NEIGH,GHOST_NEWTON,TRI>(dev);
|
||||
|
||||
@ -112,10 +112,10 @@ void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
|
||||
#if (defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)) || defined(KOKKOS_ENABLE_HIP)
|
||||
// CUDA specialization of init_style to properly call rand_pool.init()
|
||||
template<>
|
||||
void PairDPDfdtEnergyKokkos<Kokkos::Cuda>::init_style()
|
||||
void PairDPDfdtEnergyKokkos<LMPDeviceSpace>::init_style()
|
||||
{
|
||||
PairDPDfdtEnergy::init_style();
|
||||
|
||||
@ -125,10 +125,10 @@ void PairDPDfdtEnergyKokkos<Kokkos::Cuda>::init_style()
|
||||
int irequest = neighbor->nrequest - 1;
|
||||
|
||||
neighbor->requests[irequest]->
|
||||
kokkos_host = std::is_same<Kokkos::Cuda,LMPHostType>::value &&
|
||||
!std::is_same<Kokkos::Cuda,LMPDeviceType>::value;
|
||||
kokkos_host = std::is_same<LMPDeviceSpace,LMPHostType>::value &&
|
||||
!std::is_same<LMPDeviceSpace,LMPDeviceType>::value;
|
||||
neighbor->requests[irequest]->
|
||||
kokkos_device = std::is_same<Kokkos::Cuda,LMPDeviceType>::value;
|
||||
kokkos_device = std::is_same<LMPDeviceSpace,LMPDeviceType>::value;
|
||||
|
||||
if (neighflag == FULL) {
|
||||
neighbor->requests[irequest]->full = 1;
|
||||
|
||||
@ -811,22 +811,22 @@ void PPPMKokkos<DeviceType>::allocate()
|
||||
// remap takes data from 3d brick to FFT decomposition
|
||||
|
||||
int collective_flag = 0; // not yet supported in Kokkos version
|
||||
int cuda_aware_flag = lmp->kokkos->cuda_aware_flag;
|
||||
int gpu_aware_flag = lmp->kokkos->gpu_aware_flag;
|
||||
int tmp;
|
||||
|
||||
fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
0,0,&tmp,collective_flag,cuda_aware_flag);
|
||||
0,0,&tmp,collective_flag,gpu_aware_flag);
|
||||
|
||||
fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
|
||||
0,0,&tmp,collective_flag,cuda_aware_flag);
|
||||
0,0,&tmp,collective_flag,gpu_aware_flag);
|
||||
remap = new RemapKokkos<DeviceType>(lmp,world,
|
||||
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag);
|
||||
1,0,0,FFT_PRECISION,collective_flag,gpu_aware_flag);
|
||||
|
||||
// create ghost grid object for rho and electric field communication
|
||||
// also create 2 bufs for ghost grid cell comm, passed to GridComm methods
|
||||
|
||||
Reference in New Issue
Block a user