Merge pull request #2997 from stanmoore1/kk_omp_target

Add preliminary support for Kokkos OpenMPTarget backend
This commit is contained in:
Axel Kohlmeyer
2021-10-27 08:15:45 -04:00
committed by GitHub
8 changed files with 42 additions and 26 deletions

View File

@ -1966,7 +1966,7 @@ void FixACKS2ReaxFFKokkos<DeviceType>::get_chi_field()
namespace LAMMPS_NS {
template class FixACKS2ReaxFFKokkos<LMPDeviceType>;
#ifdef KOKKOS_ENABLE_CUDA
#ifdef LMP_KOKKOS_GPU
template class FixACKS2ReaxFFKokkos<LMPHostType>;
#endif
}

View File

@ -299,7 +299,7 @@ struct FixACKS2ReaxFFKokkosComputeHFunctor {
c.template compute_h_team<NEIGHFLAG>(team, atoms_per_team, vector_length);
}
size_t team_shmem_size(int team_size) const {
size_t team_shmem_size(int /*team_size*/) const {
size_t shmem_size =
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
atoms_per_team) + // s_ilist
@ -347,7 +347,7 @@ struct FixACKS2ReaxFFKokkosComputeXFunctor {
c.template compute_x_team<NEIGHFLAG>(team, atoms_per_team, vector_length);
}
size_t team_shmem_size(int team_size) const {
size_t team_shmem_size(int /*team_size*/) const {
size_t shmem_size =
Kokkos::View<int *, scratch_space, Kokkos::MemoryUnmanaged>::shmem_size(
atoms_per_team) + // s_ilist

View File

@ -30,7 +30,6 @@ FixStyle(qeq/reax/kk/host,FixQEqReaxFFKokkos<LMPHostType>);
#include "kokkos_type.h"
#include "neigh_list.h"
#include "neigh_list_kokkos.h"
#include "kokkos_base.h"
namespace LAMMPS_NS {
@ -42,7 +41,7 @@ struct TagFixQEqReaxFFPackForwardComm {};
struct TagFixQEqReaxFFUnpackForwardComm {};
template<class DeviceType>
class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase {
class FixQEqReaxFFKokkos : public FixQEqReaxFF {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;

View File

@ -44,7 +44,7 @@
#define GPU_AWARE_UNKNOWN static int have_gpu_aware = -1;
// TODO HIP: implement HIP-aware MPI support (UCX) detection
#if defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
GPU_AWARE_UNKNOWN
#elif defined(KOKKOS_ENABLE_CUDA)
@ -121,7 +121,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
} else if (strcmp(arg[iarg],"g") == 0 ||
strcmp(arg[iarg],"gpus") == 0) {
#ifndef LMP_KOKKOS_GPU
error->all(FLERR,"GPUs are requested but Kokkos has not been compiled for CUDA, HIP, or SYCL");
error->all(FLERR,"GPUs are requested but Kokkos has not been compiled using a GPU-enabled backend");
#endif
if (iarg+2 > narg) error->all(FLERR,"Invalid Kokkos command-line args");
ngpus = atoi(arg[iarg+1]);
@ -162,7 +162,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
if (ngpus > 1 && !set_flag)
error->all(FLERR,"Could not determine local MPI rank for multiple "
"GPUs with Kokkos CUDA, HIP, or SYCL because MPI library not recognized");
"GPUs with Kokkos because MPI library not recognized");
} else if (strcmp(arg[iarg],"t") == 0 ||
strcmp(arg[iarg],"threads") == 0) {
@ -204,7 +204,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
#ifdef LMP_KOKKOS_GPU
if (ngpus <= 0)
error->all(FLERR,"Kokkos has been compiled for CUDA, HIP, or SYCL but no GPUs are requested");
error->all(FLERR,"Kokkos has been compiled with GPU-enabled backend but no GPUs are requested");
#endif
#ifndef KOKKOS_ENABLE_SERIAL
@ -311,7 +311,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
error->warning(FLERR,"Detected MPICH. Disabling GPU-aware MPI");
#else
if (me == 0)
error->warning(FLERR,"Kokkos with CUDA, HIP, or SYCL assumes CUDA-aware MPI is available,"
error->warning(FLERR,"Kokkos with GPU-enabled backend assumes GPU-aware MPI is available,"
" but cannot determine if this is the case\n try"
" '-pk kokkos gpu/aware off' if getting segmentation faults");

View File

@ -87,7 +87,7 @@ E: Invalid Kokkos command-line args
Self-explanatory. See Section 2.7 of the manual for details.
E: Could not determine local MPI rank for multiple GPUs with Kokkos CUDA
E: Could not determine local MPI rank for multiple GPUs with Kokkos
because MPI library not recognized
The local MPI rank was not found in one of four supported environment variables.
@ -96,13 +96,13 @@ E: Invalid number of threads requested for Kokkos: must be 1 or greater
Self-explanatory.
E: GPUs are requested but Kokkos has not been compiled for CUDA
E: GPUs are requested but Kokkos has not been compiled using a GPU-enabled backend
Recompile Kokkos with CUDA support to use GPUs.
Recompile Kokkos with GPU-enabled backend to use GPUs.
E: Kokkos has been compiled for CUDA, HIP, or SYCL but no GPUs are requested
E: Kokkos has been compiled with GPU-enabled backend but no GPUs are requested
One or more GPUs must be used when Kokkos is compiled for CUDA/HIP/SYCL.
One or more GPUs must be used when Kokkos is compiled for CUDA/HIP/SYCL/OpenMPTarget.
W: Kokkos package already initalized, cannot reinitialize with different parameters

View File

@ -20,7 +20,7 @@
#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>
#include <Kokkos_Vectorization.hpp>
#include <Kokkos_ScatterView.hpp>
#include <Kokkos_UnorderedMap.hpp>
@ -34,7 +34,7 @@ constexpr int HALF = 4;
#define ISFINITE(x) std::isfinite(x)
#endif
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
#define LMP_KOKKOS_GPU
#endif
@ -223,6 +223,11 @@ template<>
struct ExecutionSpaceFromDevice<Kokkos::Experimental::SYCL> {
static const LAMMPS_NS::ExecutionSpace space = LAMMPS_NS::Device;
};
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
// Specialization for the OpenMPTarget backend: reports LAMMPS_NS::Device as
// the LAMMPS execution space for Kokkos::Experimental::OpenMPTarget.
template<>
struct ExecutionSpaceFromDevice<Kokkos::Experimental::OpenMPTarget> {
static const LAMMPS_NS::ExecutionSpace space = LAMMPS_NS::Device;
};
#endif
// set host pinned space
@ -232,6 +237,8 @@ typedef Kokkos::CudaHostPinnedSpace LMPPinnedHostType;
typedef Kokkos::Experimental::HIPHostPinnedSpace LMPPinnedHostType;
#elif defined(KOKKOS_ENABLE_SYCL)
typedef Kokkos::Experimental::SYCLSharedUSMSpace LMPPinnedHostType;
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
typedef Kokkos::Serial LMPPinnedHostType;
#endif
// create simple LMPDeviceSpace typedef for non CUDA-, HIP-, SYCL-, or OpenMPTarget-specific
@ -242,6 +249,8 @@ typedef Kokkos::Cuda LMPDeviceSpace;
typedef Kokkos::Experimental::HIP LMPDeviceSpace;
#elif defined(KOKKOS_ENABLE_SYCL)
typedef Kokkos::Experimental::SYCL LMPDeviceSpace;
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
typedef Kokkos::Experimental::OpenMPTarget LMPDeviceSpace;
#endif
@ -280,6 +289,11 @@ template<>
struct AtomicDup<HALFTHREAD,Kokkos::Experimental::SYCL> {
using value = Kokkos::Experimental::ScatterAtomic;
};
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
// Specialization for the HALFTHREAD neighbor flag on the OpenMPTarget backend:
// `value` names Kokkos::Experimental::ScatterAtomic.
// NOTE(review): presumably consumed as the ScatterView contribution tag so that
// concurrent updates use atomics rather than duplicated copies -- confirm at the use sites.
template<>
struct AtomicDup<HALFTHREAD,Kokkos::Experimental::OpenMPTarget> {
using value = Kokkos::Experimental::ScatterAtomic;
};
#endif
#ifdef LMP_KOKKOS_USE_ATOMICS

View File

@ -226,7 +226,8 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
data.h_resize() = 0;
Kokkos::deep_copy(d_scalars, h_scalars);
#ifdef LMP_KOKKOS_GPU
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
#define BINS_PER_BLOCK 2
const int factor = atoms_per_bin<64?2:1;
#else
@ -605,14 +606,15 @@ void NeighborKokkosExecute<DeviceType>::build_ItemGPU(typename Kokkos::TeamPolic
other_x[MY_II + 3 * atoms_per_bin] = itype;
}
other_id[MY_II] = i;
#ifndef KOKKOS_ENABLE_SYCL
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
int test = (__syncthreads_count(i >= 0 && i <= nlocal) == 0);
if (test) return;
#else
#elif defined(KOKKOS_ENABLE_SYCL)
int not_done = (i >= 0 && i <= nlocal);
dev.team_reduce(Kokkos::Max<int>(not_done));
if(not_done == 0) return;
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
dev.team_barrier();
#endif
if (i >= 0 && i < nlocal) {
@ -1055,14 +1057,15 @@ void NeighborKokkosExecute<DeviceType>::build_ItemSizeGPU(typename Kokkos::TeamP
other_x[MY_II + 4 * atoms_per_bin] = radi;
}
other_id[MY_II] = i;
#ifndef KOKKOS_ENABLE_SYCL
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
int test = (__syncthreads_count(i >= 0 && i <= nlocal) == 0);
if (test) return;
#else
#elif defined(KOKKOS_ENABLE_SYCL)
int not_done = (i >= 0 && i <= nlocal);
dev.team_reduce(Kokkos::Max<int>(not_done));
if(not_done == 0) return;
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
dev.team_barrier();
#endif
if (i >= 0 && i < nlocal) {

View File

@ -283,7 +283,7 @@ void VerletKokkos::run(int n)
atomKK->sync(Device,ALL_MASK);
//static double time = 0.0;
//Kokkos::Impl::Timer ktimer;
//Kokkos::Timer ktimer;
timer->init_timeout();
for (int i = 0; i < n; i++) {
@ -445,7 +445,7 @@ void VerletKokkos::run(int n)
if (pair_compute_flag) {
atomKK->sync(force->pair->execution_space,force->pair->datamask_read);
atomKK->sync(force->pair->execution_space,~(~force->pair->datamask_read|(F_MASK | ENERGY_MASK | VIRIAL_MASK)));
Kokkos::Impl::Timer ktimer;
Kokkos::Timer ktimer;
force->pair->compute(eflag,vflag);
atomKK->modified(force->pair->execution_space,force->pair->datamask_modify);
atomKK->modified(force->pair->execution_space,~(~force->pair->datamask_modify|(F_MASK | ENERGY_MASK | VIRIAL_MASK)));