Merge pull request #3666 from ndtrung81/gpu-cuda-mps

Fixed GPU library builds with CUDA MPS
2023-03-01 21:02:27 -05:00
parent c8696478b7 2ccfe635ce
commit 2979296c02
6 changed files with 8 additions and 10 deletions
--- a/cmake/Modules/Packages/GPU.cmake
+++ b/cmake/Modules/Packages/GPU.cmake
@ -60,9 +60,9 @@ if(GPU_API STREQUAL "CUDA")
  option(CUDA_MPS_SUPPORT "Enable tweaks to support CUDA Multi-process service (MPS)" OFF)
  if(CUDA_MPS_SUPPORT)
    if(CUDPP_OPT)
-      message(FATAL_ERROR "Must use -DCUDPP_OPT=OFF with -DGPU_CUDA_MPS_SUPPORT=ON")
+      message(FATAL_ERROR "Must use -DCUDPP_OPT=OFF with -DCUDA_MPS_SUPPORT=ON")
    endif()
-    set(GPU_CUDA_MPS_FLAGS "-DCUDA_PROXY")
+    set(GPU_CUDA_MPS_FLAGS "-DCUDA_MPS_SUPPORT")
  endif()
  set(GPU_ARCH "sm_50" CACHE STRING "LAMMPS GPU CUDA SM primary architecture (e.g. sm_60)")
--- a/lib/gpu/Makefile.cuda
+++ b/lib/gpu/Makefile.cuda
@ -54,7 +54,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c
 CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
 CUDR_OPTS = -O2 $(LMP_INC)
-CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
+CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_MPS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
         $(CUDPP_OPT)
 # Headers for Geryon
--- a/lib/gpu/Makefile.cuda_mps
+++ b/lib/gpu/Makefile.cuda_mps
@ -30,7 +30,7 @@ AR = ar
 BSH = /bin/sh
 CUDPP_OPT =
-CUDA_MPS  = -DCUDA_PROXY
+CUDA_MPS  = -DCUDA_MPS_SUPPORT
 # device code compiler and settings
@ -53,7 +53,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c
 CUDR_CPP = mpicxx -fopenmp -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
 CUDR_OPTS = -O2 $(LMP_INC)
-CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
+CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_MPS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
         $(CUDPP_OPT)
 # Headers for Geryon
--- a/lib/gpu/Makefile.oneapi
+++ b/lib/gpu/Makefile.oneapi
@ -18,7 +18,7 @@ OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \
          $(LMP_INC) $(OCL_INC) $(CPP_OPT)
 OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
 OCL_PREC = -D_SINGLE_DOUBLE
-OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
+OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
 BIN_DIR = ./
 OBJ_DIR = ./
--- a/lib/gpu/README
+++ b/lib/gpu/README
@ -205,8 +205,7 @@ $(CUDA_HOME)/lib64/stubs), that can be used for linking.
 Best performance with the GPU library is typically with multiple MPI processes
 sharing the same GPU cards. For NVIDIA, this is most efficient with CUDA
 MPS enabled. To prevent runtime errors for GPUs configured in exclusive process
-mode with MPS, the GPU library should be build with either of the equivalent
+mode with MPS, the GPU library should be build with the -DCUDA_MPS_SUPPORT flag.
--DCUDA_MPS_SUPPORT or -DCUDA_PROXY flags.
 ------------------------------------------------------------------------------
                             HIP BUILD NOTES
@ -244,7 +243,6 @@ _SINGLE_SINGLE          Build library for single precision mode
 _SINGLE_DOUBLE          Build library for mixed precision mode
 _DOUBLE_DOUBLE          Build library for double precision mode
 CUDA_MPS_SUPPORT        Do not generate errors for exclusive mode for CUDA
-CUDA_PROXY              Same as above
 MPI_GERYON              Library should use MPI_Abort for unhandled errors
 GERYON_NUMA_FISSION     Accelerators with main memory NUMA are split into
                        multiple virtual accelerators for each NUMA node
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -277,7 +277,7 @@ int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu,
  MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
  MPI_Comm_rank(_comm_gpu,&_gpu_rank);
-  #if !defined(CUDA_PROXY) && !defined(CUDA_MPS_SUPPORT)
+  #if !defined(CUDA_MPS_SUPPORT)
  if (_procs_per_gpu>1 && !gpu->sharing_supported(my_gpu))
    return -7;
  #endif