Merge pull request #3666 from ndtrung81/gpu-cuda-mps
Fixed GPU library builds with CUDA MPS
This commit is contained in:
@ -60,9 +60,9 @@ if(GPU_API STREQUAL "CUDA")
|
|||||||
option(CUDA_MPS_SUPPORT "Enable tweaks to support CUDA Multi-process service (MPS)" OFF)
|
option(CUDA_MPS_SUPPORT "Enable tweaks to support CUDA Multi-process service (MPS)" OFF)
|
||||||
if(CUDA_MPS_SUPPORT)
|
if(CUDA_MPS_SUPPORT)
|
||||||
if(CUDPP_OPT)
|
if(CUDPP_OPT)
|
||||||
message(FATAL_ERROR "Must use -DCUDPP_OPT=OFF with -DGPU_CUDA_MPS_SUPPORT=ON")
|
message(FATAL_ERROR "Must use -DCUDPP_OPT=OFF with -DCUDA_MPS_SUPPORT=ON")
|
||||||
endif()
|
endif()
|
||||||
set(GPU_CUDA_MPS_FLAGS "-DCUDA_PROXY")
|
set(GPU_CUDA_MPS_FLAGS "-DCUDA_MPS_SUPPORT")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(GPU_ARCH "sm_50" CACHE STRING "LAMMPS GPU CUDA SM primary architecture (e.g. sm_60)")
|
set(GPU_ARCH "sm_50" CACHE STRING "LAMMPS GPU CUDA SM primary architecture (e.g. sm_60)")
|
||||||
|
|||||||
@ -54,7 +54,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c
|
|||||||
|
|
||||||
CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
|
CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
|
||||||
CUDR_OPTS = -O2 $(LMP_INC)
|
CUDR_OPTS = -O2 $(LMP_INC)
|
||||||
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
|
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_MPS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
|
||||||
$(CUDPP_OPT)
|
$(CUDPP_OPT)
|
||||||
|
|
||||||
# Headers for Geryon
|
# Headers for Geryon
|
||||||
|
|||||||
@ -30,7 +30,7 @@ AR = ar
|
|||||||
BSH = /bin/sh
|
BSH = /bin/sh
|
||||||
|
|
||||||
CUDPP_OPT =
|
CUDPP_OPT =
|
||||||
CUDA_MPS = -DCUDA_PROXY
|
CUDA_MPS = -DCUDA_MPS_SUPPORT
|
||||||
|
|
||||||
# device code compiler and settings
|
# device code compiler and settings
|
||||||
|
|
||||||
@ -53,7 +53,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c
|
|||||||
|
|
||||||
CUDR_CPP = mpicxx -fopenmp -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
|
CUDR_CPP = mpicxx -fopenmp -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
|
||||||
CUDR_OPTS = -O2 $(LMP_INC)
|
CUDR_OPTS = -O2 $(LMP_INC)
|
||||||
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
|
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_MPS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
|
||||||
$(CUDPP_OPT)
|
$(CUDPP_OPT)
|
||||||
|
|
||||||
# Headers for Geryon
|
# Headers for Geryon
|
||||||
|
|||||||
@ -18,7 +18,7 @@ OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \
|
|||||||
$(LMP_INC) $(OCL_INC) $(CPP_OPT)
|
$(LMP_INC) $(OCL_INC) $(CPP_OPT)
|
||||||
OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
|
OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
|
||||||
OCL_PREC = -D_SINGLE_DOUBLE
|
OCL_PREC = -D_SINGLE_DOUBLE
|
||||||
OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
|
OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
|
||||||
|
|
||||||
BIN_DIR = ./
|
BIN_DIR = ./
|
||||||
OBJ_DIR = ./
|
OBJ_DIR = ./
|
||||||
|
|||||||
@ -205,8 +205,7 @@ $(CUDA_HOME)/lib64/stubs), that can be used for linking.
|
|||||||
Best performance with the GPU library is typically with multiple MPI processes
|
Best performance with the GPU library is typically with multiple MPI processes
|
||||||
sharing the same GPU cards. For NVIDIA, this is most efficient with CUDA
|
sharing the same GPU cards. For NVIDIA, this is most efficient with CUDA
|
||||||
MPS enabled. To prevent runtime errors for GPUs configured in exclusive process
|
MPS enabled. To prevent runtime errors for GPUs configured in exclusive process
|
||||||
mode with MPS, the GPU library should be built with either of the equivalent
|
mode with MPS, the GPU library should be built with the -DCUDA_MPS_SUPPORT flag.
|
||||||
-DCUDA_MPS_SUPPORT or -DCUDA_PROXY flags.
|
|
||||||
|
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
HIP BUILD NOTES
|
HIP BUILD NOTES
|
||||||
@ -244,7 +243,6 @@ _SINGLE_SINGLE Build library for single precision mode
|
|||||||
_SINGLE_DOUBLE Build library for mixed precision mode
|
_SINGLE_DOUBLE Build library for mixed precision mode
|
||||||
_DOUBLE_DOUBLE Build library for double precision mode
|
_DOUBLE_DOUBLE Build library for double precision mode
|
||||||
CUDA_MPS_SUPPORT Do not generate errors for exclusive mode for CUDA
|
CUDA_MPS_SUPPORT Do not generate errors for exclusive mode for CUDA
|
||||||
CUDA_PROXY Same as above
|
|
||||||
MPI_GERYON Library should use MPI_Abort for unhandled errors
|
MPI_GERYON Library should use MPI_Abort for unhandled errors
|
||||||
GERYON_NUMA_FISSION Accelerators with main memory NUMA are split into
|
GERYON_NUMA_FISSION Accelerators with main memory NUMA are split into
|
||||||
multiple virtual accelerators for each NUMA node
|
multiple virtual accelerators for each NUMA node
|
||||||
|
|||||||
@ -277,7 +277,7 @@ int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu,
|
|||||||
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
|
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
|
||||||
MPI_Comm_rank(_comm_gpu,&_gpu_rank);
|
MPI_Comm_rank(_comm_gpu,&_gpu_rank);
|
||||||
|
|
||||||
#if !defined(CUDA_PROXY) && !defined(CUDA_MPS_SUPPORT)
|
#if !defined(CUDA_MPS_SUPPORT)
|
||||||
if (_procs_per_gpu>1 && !gpu->sharing_supported(my_gpu))
|
if (_procs_per_gpu>1 && !gpu->sharing_supported(my_gpu))
|
||||||
return -7;
|
return -7;
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user