remove support for CUDA toolkits before version 8 and GPUs older than Kepler

Axel Kohlmeyer
2022-04-12 15:48:16 -04:00
parent 2ff8ac0cb2
commit f3363070e7
13 changed files with 74 additions and 253 deletions
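
This change collapses the many scattered #if CUDA_VERSION guards in Geryon into a single compile-time minimum-version check in the CUDA driver UCL_Device constructor (the #error guard below). As a minimal standalone sketch of the pattern, assuming nothing from Geryon itself: CUDA_VERSION comes from cuda.h and encodes the toolkit as 1000*major + 10*minor (8000 = 8.0, 11040 = 11.4), and cuDriverGetVersion() reports the installed driver with the same integer encoding, which is what the driver_version/1000 arithmetic in print_all() relies on.

    // Standalone sketch, not part of this commit: minimum-toolkit guard plus driver version decode.
    #include <cstdio>
    #include <cuda.h>

    #if CUDA_VERSION < 8000     // 1000*major + 10*minor, e.g. 8000 = CUDA 8.0
    #error CUDA Toolkit version 8 or later required
    #endif

    int main() {
      int v = 0;
      cuDriverGetVersion(&v);   // installed driver version, same integer encoding
      std::printf("CUDA driver %d.%d\n", v / 1000, (v % 1000) / 10);
      return 0;
    }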

View File

@@ -379,18 +379,9 @@ UCL_Device::UCL_Device() {
prop.regsPerBlock = hip_prop.regsPerBlock;
prop.clockRate = hip_prop.clockRate;
prop.computeMode = hip_prop.computeMode;
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
//#if CUDA_VERSION >= 2020
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
//#endif
//#if CUDA_VERSION >= 3010
prop.concurrentKernels = hip_prop.concurrentKernels;
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
//#endif
_properties.push_back(prop);
}
@@ -447,13 +438,11 @@ void UCL_Device::clear() {
// List all devices along with all properties
void UCL_Device::print_all(std::ostream &out) {
//#if CUDA_VERSION >= 2020
int driver_version;
hipDriverGetVersion(&driver_version);
out << "Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
//#endif
if (num_devices() == 0)
out << "There is no device supporting HIP\n";
@@ -470,12 +459,10 @@ void UCL_Device::print_all(std::ostream &out) {
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
//#if CUDA_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
//#endif
out << " Total amount of constant memory: "
<< _properties[i].totalConstantMemory << " bytes\n";
out << " Total amount of local/shared memory per block: "
@@ -494,58 +481,29 @@ void UCL_Device::print_all(std::ostream &out) {
<< _properties[i].maxGridSize[0] << " x "
<< _properties[i].maxGridSize[1] << " x "
<< _properties[i].maxGridSize[2] << std::endl;
//out << " Maximum memory pitch: "
// << max_pitch(i) << " bytes\n";
//out << " Texture alignment: "
// << _properties[i].textureAlign << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
//#if CUDA_VERSION >= 2020
//out << " Run time limit on kernels: ";
//if (_properties[i].kernelExecTimeoutEnabled)
// out << "Yes\n";
//else
// out << "No\n";
out << " Integrated: ";
if (_properties[i].integrated)
out << "Yes\n";
else
out << "No\n";
//out << " Support host page-locked memory mapping: ";
//if (_properties[i].canMapHostMemory)
// out << "Yes\n";
//else
// out << "No\n";
out << " Compute mode: ";
if (_properties[i].computeMode == hipComputeModeDefault)
out << "Default\n"; // multiple threads can use device
//#if CUDA_VERSION >= 8000
// else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
//#else
else if (_properties[i].computeMode == hipComputeModeExclusive)
//#endif
out << "Exclusive\n"; // only thread can use device
else if (_properties[i].computeMode == hipComputeModeProhibited)
out << "Prohibited\n"; // no thread can use device
//#if CUDART_VERSION >= 4000
else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
out << "Exclusive Process\n"; // multiple threads 1 process
//#endif
else
out << "Unknown\n";
//#endif
//#if CUDA_VERSION >= 3010
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
else
out << "No\n";
//out << " Device has ECC support enabled: ";
//if (_properties[i].ECCEnabled)
// out << "Yes\n";
//else
// out << "No\n";
//#endif
}
}
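
In the HIP backend the long-commented-out CUDA attribute queries are deleted outright; only the direct hipDeviceGetAttribute() call for the integrated-GPU flag remains. A minimal standalone sketch of that query (device loop and printing are illustrative, not Geryon code):

    #include <cstdio>
    #include <hip/hip_runtime.h>

    int main() {
      int ndev = 0;
      hipGetDeviceCount(&ndev);
      for (int d = 0; d < ndev; ++d) {
        int integrated = 0;   // same attribute query kept in UCL_Device::UCL_Device() above
        hipDeviceGetAttribute(&integrated, hipDeviceAttributeIntegrated, d);
        std::printf("device %d integrated: %s\n", d, integrated ? "yes" : "no");
      }
      return 0;
    }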

View File

@@ -5,11 +5,7 @@
#include <cassert>
#include <hip/hip_runtime.h>
//#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
//#else
//#define CUDA_INT_TYPE unsigned
//#endif
#ifdef MPI_GERYON
#include "mpi.h"

View File

@@ -71,9 +71,6 @@ class UCL_Texture {
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &) {
//#if CUDA_VERSION < 4000
//CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
//#endif
}
private:

View File

@@ -320,6 +320,9 @@ class UCL_Device {
// Grabs the properties for all devices
UCL_Device::UCL_Device() {
#if CUDA_VERSION < 8000
#error CUDA Toolkit version 8 or later required
#endif
CU_SAFE_CALL_NS(cuInit(0));
CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
for (int i=0; i<_num_devices; ++i) {
@@ -358,16 +361,12 @@ UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
#if CUDA_VERSION >= 2020
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
#endif
#if CUDA_VERSION >= 3010
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
#endif
_properties.push_back(prop);
}
@@ -415,13 +414,10 @@ void UCL_Device::clear() {
// List all devices along with all properties
void UCL_Device::print_all(std::ostream &out) {
#if CUDA_VERSION >= 2020
int driver_version;
cuDriverGetVersion(&driver_version);
out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
#endif
<< driver_version/1000 << "." << driver_version%100 << std::endl;
if (num_devices() == 0)
out << "There is no device supporting CUDA\n";
@@ -438,12 +434,10 @@ void UCL_Device::print_all(std::ostream &out) {
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
#if CUDA_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
#endif
out << " Total amount of constant memory: "
<< _properties[i].totalConstantMemory << " bytes\n";
out << " Total amount of local/shared memory per block: "
@@ -468,7 +462,6 @@ void UCL_Device::print_all(std::ostream &out) {
<< _properties[i].textureAlign << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
#if CUDA_VERSION >= 2020
out << " Run time limit on kernels: ";
if (_properties[i].kernelExecTimeoutEnabled)
out << "Yes\n";
@@ -487,22 +480,14 @@ void UCL_Device::print_all(std::ostream &out) {
out << " Compute mode: ";
if (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT)
out << "Default\n"; // multiple threads can use device
#if CUDA_VERSION >= 8000
else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
#else
else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE)
#endif
out << "Exclusive\n"; // only thread can use device
else if (_properties[i].computeMode == CU_COMPUTEMODE_PROHIBITED)
out << "Prohibited\n"; // no thread can use device
#if CUDART_VERSION >= 4000
else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
out << "Exclusive Process\n"; // multiple threads 1 process
#endif
else
out << "Unknown\n";
#endif
#if CUDA_VERSION >= 3010
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
@@ -513,7 +498,6 @@ void UCL_Device::print_all(std::ostream &out) {
out << "Yes\n";
else
out << "No\n";
#endif
}
}
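
With the toolkit floor raised to 8.0, the removed #if/#else around the compute-mode output is no longer needed: the CU_COMPUTEMODE_EXCLUSIVE branch only existed for pre-8.0 toolkits, leaving Default, Prohibited and Exclusive Process. A hypothetical standalone helper (not part of Geryon) that performs the same mapping printed above:

    #include <cuda.h>

    // Map a device's compute mode to the labels used by print_all().
    static const char *compute_mode_name(CUdevice dev) {
      int mode = CU_COMPUTEMODE_DEFAULT;
      cuDeviceGetAttribute(&mode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
      if (mode == CU_COMPUTEMODE_DEFAULT)           return "Default";           // any thread may use the device
      if (mode == CU_COMPUTEMODE_PROHIBITED)        return "Prohibited";        // no thread may use the device
      if (mode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) return "Exclusive Process"; // one process at a time
      return "Unknown";
    }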

View File

@@ -165,17 +165,11 @@ class UCL_Program {
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
}
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
set_function(program,function);
_cq=program._cq;
@@ -211,11 +205,7 @@ class UCL_Kernel {
if (index==_num_args)
add_arg(arg);
else if (index<_num_args)
#if CUDA_VERSION >= 4000
_kernel_args[index]=arg;
#else
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
#endif
else
assert(0==1); // Must add kernel parameters in sequential order
}
@@ -242,15 +232,7 @@ class UCL_Kernel {
/// Add a kernel argument.
inline void add_arg(const CUdeviceptr* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=(void *)arg;
#else
void* ptr = (void*)(size_t)(*arg);
_param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
_offsets.push_back(_param_size);
_param_size+=sizeof(ptr);
#endif
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
@@ -258,14 +240,7 @@ class UCL_Kernel {
/// Add a kernel argument.
template <class dtype>
inline void add_arg(const dtype* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=const_cast<dtype *>(arg);
#else
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
_offsets.push_back(_param_size);
_param_size+=sizeof(dtype);
#endif
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
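
With the pre-CUDA-4.0 cuParamSetv() path gone, add_arg() now only records the address of each argument in _kernel_args; the values are read when the kernel is launched. A rough usage sketch under that assumption (helper name and kernel setup are hypothetical; the wrapper calls mirror the methods in this header, which is assumed to be included with its namespace in scope):

    #include <cuda.h>

    // Hypothetical helper: enqueue an already-configured Geryon kernel. add_arg()
    // stores &d_x and &n, so both must stay valid until run() issues the launch.
    void enqueue_scale(UCL_Kernel &k, CUdeviceptr d_x, int n) {
      k.clear_args();
      k.add_arg(&d_x);                    // records a CUdeviceptr*, not the value
      k.add_arg(&n);                      // records an int*
      k.set_size((n + 127) / 128, 128);   // n/128 blocks of 128 threads each
      k.run();                            // cuLaunchKernel reads the argument values here
    }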
@@ -298,13 +273,9 @@ class UCL_Kernel {
_num_blocks[0]=num_blocks;
_num_blocks[1]=1;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size;
_block_size[1]=1;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
@@ -323,13 +294,9 @@ class UCL_Kernel {
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
@@ -350,14 +317,9 @@ class UCL_Kernel {
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=block_size_z;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
block_size_z));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
@@ -373,23 +335,14 @@ class UCL_Kernel {
/// Run the kernel in the default command queue
inline void run() {
#if CUDA_VERSION >= 4000
CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
_num_blocks[2],_block_size[0],_block_size[1],
_block_size[2],0,_cq,_kernel_args,nullptr));
#else
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
#endif
}
/// Clear any arguments associated with the kernel
inline void clear_args() {
_num_args=0;
#if CUDA_VERSION < 4000
_offsets.clear();
_param_size=0;
#endif
}
/// Return the default command queue/stream associated with this data
@@ -406,13 +359,8 @@ class UCL_Kernel {
unsigned _num_args;
friend class UCL_Texture;
#if CUDA_VERSION >= 4000
unsigned _block_size[3];
void * _kernel_args[UCL_MAX_KERNEL_ARGS];
#else
std::vector<unsigned> _offsets;
unsigned _param_size;
#endif
};
} // namespace
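
The retained run() path maps directly onto cuLaunchKernel(): grid and block dimensions are passed as call arguments and the kernel parameters as an array of pointers, which is why cuFuncSetBlockShape(), cuParamSetSize() and cuLaunchGridAsync() could be removed. A standalone sketch of that call (function and argument names are illustrative):

    #include <cuda.h>

    // Launch a kernel through the driver API the way UCL_Kernel::run() does:
    // kernel arguments are passed as an array of pointers to each argument value.
    void launch_scale(CUfunction f, CUstream cq, CUdeviceptr d_x, float alpha, int n) {
      void *args[] = { &d_x, &alpha, &n };
      unsigned block = 256;
      unsigned grid  = (unsigned)((n + block - 1) / block);
      cuLaunchKernel(f, grid, 1, 1,       // grid dimensions
                     block, 1, 1,         // block dimensions
                     0,                   // dynamic shared memory in bytes
                     cq,                  // stream / command queue
                     args, nullptr);      // kernelParams, extra
    }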

View File

@@ -5,11 +5,7 @@
#include <cassert>
#include <cuda.h>
#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
#else
#define CUDA_INT_TYPE unsigned
#endif
#ifdef MPI_GERYON
#include "mpi.h"

View File

@@ -69,9 +69,6 @@ class UCL_Texture {
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) {
#if CUDA_VERSION < 4000
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
#endif
}
private:

View File

@@ -25,21 +25,8 @@
#ifndef UCL_NV_KERNEL_H
#define UCL_NV_KERNEL_H
#if (__CUDA_ARCH__ < 200)
#define mul24 __mul24
#define MEM_THREADS 16
#else
#define mul24(X,Y) (X)*(Y)
#define MEM_THREADS 32
#endif
#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4
{
double x, y, z, w;
};
typedef struct _double4 double4;
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
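
With architectures below compute capability 2.0 no longer supported, mul24() is always a plain multiplication and MEM_THREADS is fixed at 32, so device code built on these macros behaves the same on every supported (Kepler or newer) GPU. A minimal sketch of a kernel written against the retained macros (kernel name and arguments are illustrative, and ucl_nv_kernel.h is assumed to be included):

    __global__ void scale_vector(float *v, float alpha, int n) {
      int i = GLOBAL_ID_X;          // threadIdx.x + mul24(blockIdx.x, blockDim.x)
      if (i < n) v[i] *= alpha;     // mul24(X,Y) now expands to (X)*(Y) on all targets
    }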