Use primary context in CUDA GPU code.

Because LAMMPS uses the low-level CUDA driver API, it must ensure that the correct context is current whenever it calls a driver function. At present LAMMPS creates its own context inside `UCL_Device::set` and switches to it, but then assumes that this context is still current for every subsequent CUDA call. If another part of the program makes a different context current (for example, the CUDA runtime, which uses the device's "primary" context), the later driver calls inside LAMMPS fail. This patch changes `UCL_Device::set` to retain and use the primary context of the requested device instead of creating a new one, and changes `UCL_Device::clear` to restore the previously current context and release the primary context. The fix is not perfect, because LAMMPS still does not verify that the correct context is current before each driver API call, but it does allow LAMMPS to work alongside libraries that use the runtime API.
2022-09-06 08:54:59 +10:00
parent 8315f9996b
commit 294a1c2168
1 changed files with 8 additions and 2 deletions
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@ -316,6 +316,7 @@ class UCL_Device {
  std::vector<CUstream> _cq;
  CUdevice _cu_device;
  CUcontext _context;
+  CUcontext _old_context;
 };

 // Grabs the properties for all devices
@ -391,8 +392,9 @@ int UCL_Device::set_platform(const int pid) {
 int UCL_Device::set(int num) {
  clear();
  _device=_properties[num].device_id;
+  CU_SAFE_CALL_NS(cuCtxGetCurrent(&_old_context));
  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
-  CUresult err=cuCtxCreate(&_context,0,_cu_device);
+  CUresult err=cuDevicePrimaryCtxRetain(&_context,_cu_device);
  if (err!=CUDA_SUCCESS) {
    #ifndef UCL_NO_EXIT
    std::cerr << "UCL Error: Could not access accelerator number " << num
@ -401,13 +403,17 @@ int UCL_Device::set(int num) {
    #endif
    return UCL_ERROR;
  }
+  if (_context != _old_context) {
+    CU_SAFE_CALL_NS(cuCtxSetCurrent(_context));
+  }
  return UCL_SUCCESS;
 }

 void UCL_Device::clear() {
  if (_device>-1) {
    for (int i=1; i<num_queues(); i++) pop_command_queue();
-    cuCtxDestroy(_context);
+    CU_SAFE_CALL_NS(cuCtxSetCurrent(_old_context));
+    CU_SAFE_CALL_NS(cuDevicePrimaryCtxRelease(_cu_device));
  }
  _device=-1;
 }