From f3363070e7ddc9f45b294025d19c142a602e3c8c Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Tue, 12 Apr 2022 15:48:16 -0400
Subject: [PATCH] remove support for CUDA toolkits before version 8 and GPUs
 older than Kepler

---
 cmake/Modules/Packages/GPU.cmake | 100 +++++++++++++++++--------------
 doc/src/Build_extras.rst         |   7 ++-
 lib/gpu/Makefile.linux_multi     |  11 +---
 lib/gpu/README                   |   7 ++-
 lib/gpu/geryon/hip_device.h      |  42 -------------
 lib/gpu/geryon/hip_macros.h      |   4 --
 lib/gpu/geryon/hip_texture.h     |   3 -
 lib/gpu/geryon/nvd_device.h      |  24 ++------
 lib/gpu/geryon/nvd_kernel.h      |  52 ----------------
 lib/gpu/geryon/nvd_macros.h      |   4 --
 lib/gpu/geryon/nvd_texture.h     |   3 -
 lib/gpu/geryon/ucl_nv_kernel.h   |  13 ----
 lib/gpu/lal_pre_cuda_hip.h       |  57 ------------------
 13 files changed, 74 insertions(+), 253 deletions(-)

diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake
index 53b5f33b9b..75569aa55d 100644
--- a/cmake/Modules/Packages/GPU.cmake
+++ b/cmake/Modules/Packages/GPU.cmake
@@ -80,14 +80,14 @@ if(GPU_API STREQUAL "CUDA")
   # --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
   set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH}")
 
-  # apply the following to build "fat" CUDA binaries only for known CUDA toolkits
-  if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
-    message(WARNING "Untested CUDA Toolkit version. Use at your own risk")
+  # apply the following to build "fat" CUDA binaries only for known CUDA toolkits since version 8.0
+  # only the Kepler architecture and beyond is supported
+  if(CUDA_VERSION VERSION_LESS 8.0)
+    message(FATAL_ERROR "CUDA Toolkit version 8.0 or later is required")
+  elseif(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
+    message(WARNING "Untested CUDA Toolkit version ${CUDA_VERSION}. Use at your own risk")
+    set(GPU_CUDA_GENCODE "-arch=all")
   else()
-    # Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
-    if((CUDA_VERSION VERSION_GREATER_EQUAL "3.2") AND (CUDA_VERSION VERSION_LESS "9.0"))
-      string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_20,code=[sm_20,compute_20] ")
-    endif()
     # Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
     if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
       string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30] ")
@@ -120,6 +120,14 @@ if(GPU_API STREQUAL "CUDA")
     if(CUDA_VERSION VERSION_GREATER_EQUAL "11.1")
       string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_86,code=[sm_86,compute_86]")
     endif()
+    # Hopper (GPU Arch 9.0) is supported by CUDA 11.8 and later
+    if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
+      string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_90,code=[sm_90,compute_90]")
+    endif()
+    # Ada Lovelace (GPU Arch 8.9) is supported by CUDA 11.8 and later
+    #if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
+    #  string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_89,code=[sm_89,compute_89]")
+    #endif()
   endif()
 
   cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC}
@@ -257,43 +265,47 @@ elseif(GPU_API STREQUAL "HIP")
     find_package(CUDA REQUIRED)
     set(HIP_ARCH "sm_50" CACHE STRING "HIP primary CUDA architecture (e.g. sm_60)")
 
-    # build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
-    # --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
-    set(HIP_CUDA_GENCODE "-arch=${HIP_ARCH}")
-    # Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
-    if((CUDA_VERSION VERSION_GREATER_EQUAL "3.2") AND (CUDA_VERSION VERSION_LESS "9.0"))
-      string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_20,code=[sm_20,compute_20]")
-    endif()
-    # Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
-    if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
-      string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30]")
-    endif()
-    # Kepler (GPU Arch 3.5) is supported by CUDA 5 to CUDA 11.0
-    if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "12.0"))
-      string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_35,code=[sm_35,compute_35]")
-    endif()
-    # Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
-    if(CUDA_VERSION VERSION_GREATER_EQUAL "6.0")
-      string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52]")
-    endif()
-    # Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
-    if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0")
-      string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61]")
-    endif()
-    # Volta (GPU Arch 7.0) is supported by CUDA 9 and later
-    if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
-      string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_70,code=[sm_70,compute_70]")
-    endif()
-    # Turing (GPU Arch 7.5) is supported by CUDA 10 and later
-    if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
-      string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_75,code=[sm_75,compute_75]")
-    endif()
-    # Ampere (GPU Arch 8.0) is supported by CUDA 11 and later
-    if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
-      string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_80,code=[sm_80,compute_80]")
-    endif()
-    if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
-      message(WARNING "Unsupported CUDA version. Use at your own risk.")
+    if(CUDA_VERSION VERSION_LESS 8.0)
+      message(FATAL_ERROR "CUDA Toolkit version 8.0 or later is required")
+    elseif(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
+      message(WARNING "Untested CUDA Toolkit version ${CUDA_VERSION}. Use at your own risk")
+      set(HIP_CUDA_GENCODE "-arch=all")
+    else()
+      # build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
+      # --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
+      set(HIP_CUDA_GENCODE "-arch=${HIP_ARCH}")
+      # Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
+      if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
+        string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30]")
+      endif()
+      # Kepler (GPU Arch 3.5) is supported by CUDA 5 to CUDA 11.x
+      if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "12.0"))
+        string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_35,code=[sm_35,compute_35]")
+      endif()
+      # Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
+      if(CUDA_VERSION VERSION_GREATER_EQUAL "6.0")
+        string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52]")
+      endif()
+      # Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
+      if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0")
+        string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61]")
+      endif()
+      # Volta (GPU Arch 7.0) is supported by CUDA 9 and later
+      if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
+        string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_70,code=[sm_70,compute_70]")
+      endif()
+      # Turing (GPU Arch 7.5) is supported by CUDA 10 and later
+      if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
+        string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_75,code=[sm_75,compute_75]")
+      endif()
+      # Ampere (GPU Arch 8.0) is supported by CUDA 11 and later
+      if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
+        string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_80,code=[sm_80,compute_80]")
+      endif()
+      # Hopper (GPU Arch 9.0) is supported by CUDA 11.8 and later
+      if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
+        string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_90,code=[sm_90,compute_90]")
+      endif()
     endif()
   endif()
 
diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst
index 03ebf5ee5a..ca55038d7b 100644
--- a/doc/src/Build_extras.rst
+++ b/doc/src/Build_extras.rst
@@ -141,8 +141,6 @@ CMake build
 :code:`GPU_ARCH` settings for different GPU hardware is as follows:
 
-* sm_12 or sm_13 for GT200 (supported by CUDA 3.2 until CUDA 6.5)
-* sm_20 or sm_21 for Fermi (supported by CUDA 3.2 until CUDA 7.5)
 * sm_30 for Kepler (supported since CUDA 5 and until CUDA 10.x)
 * sm_35 or sm_37 for Kepler (supported since CUDA 5 and until CUDA 11.x)
 * sm_50 or sm_52 for Maxwell (supported since CUDA 6)
@@ -150,6 +148,7 @@ CMake build
 * sm_70 for Volta (supported since CUDA 9)
 * sm_75 for Turing (supported since CUDA 10)
 * sm_80 for Ampere (supported since CUDA 11)
+.. * sm_90 for Hopper (supported since CUDA 11.8)
 
 A more detailed list can be found, for example,
 at `Wikipedia's CUDA article <https://en.wikipedia.org/wiki/CUDA>`_
@@ -160,6 +159,10 @@ Thus the GPU_ARCH setting is merely an optimization, to have code for
 the preferred GPU architecture directly included rather than having to wait
 for the JIT compiler of the CUDA driver to translate it.
 
+Version 8.0 or later of the CUDA toolkit and a GPU architecture of Kepler
+or later are required; the GPU architecture must *also* be supported by
+the CUDA toolkit **and** the CUDA driver in use.
+
 When building with CMake, you **must NOT** build the GPU library in
 ``lib/gpu`` using the traditional build procedure.
 CMake will detect files generated by that process and will terminate
 with an error and a suggestion for how to remove them.

diff --git a/lib/gpu/Makefile.linux_multi b/lib/gpu/Makefile.linux_multi
index f3d89fd9f0..fda640221f 100644
--- a/lib/gpu/Makefile.linux_multi
+++ b/lib/gpu/Makefile.linux_multi
@@ -13,14 +13,6 @@ endif
 
 NVCC = nvcc
 
-# obsolete hardware. not supported by current drivers anymore.
-#CUDA_ARCH = -arch=sm_13
-#CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE
-
-# Fermi hardware
-#CUDA_ARCH = -arch=sm_20
-#CUDA_ARCH = -arch=sm_21
-
 # Kepler hardware
 #CUDA_ARCH = -arch=sm_30
 #CUDA_ARCH = -arch=sm_32
@@ -45,6 +37,9 @@ CUDA_ARCH = -arch=sm_50
 #CUDA_ARCH = -arch=sm_80
 #CUDA_ARCH = -arch=sm_86
 
+# Hopper hardware
+#CUDA_ARCH = -arch=sm_90
+
 CUDA_CODE = -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] \
             -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] \
             -gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_75,code=[sm_75,compute_75] \

diff --git a/lib/gpu/README b/lib/gpu/README
index 3185db3af6..b8866cf79e 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -171,7 +171,12 @@ NOTE: when compiling with CMake, all of the considerations listed below
 are considered within the CMake configuration process, so no separate
 compilation of the gpu library is required. Also this will build in
 support for all compute architecture that are supported by the CUDA toolkit version
-used to build the gpu library.
+used to build the gpu library. A similar setup is possible using
+Makefile.linux_multi after adjusting the settings for the CUDA toolkit in use.
+
+Only CUDA toolkit version 8.0 and later and only GPU architecture 3.0
+(aka Kepler) and later are supported by this version of LAMMPS. To use
+older hardware or software, you have to use an older version of LAMMPS.
+
 If you do not want to use a fat binary, that supports multiple CUDA
 architectures, the CUDA_ARCH must be set to match the GPU architecture. This

diff --git a/lib/gpu/geryon/hip_device.h b/lib/gpu/geryon/hip_device.h
index 456a03b180..fadeec8711 100644
--- a/lib/gpu/geryon/hip_device.h
+++ b/lib/gpu/geryon/hip_device.h
@@ -379,18 +379,9 @@ UCL_Device::UCL_Device() {
     prop.regsPerBlock = hip_prop.regsPerBlock;
     prop.clockRate = hip_prop.clockRate;
     prop.computeMode = hip_prop.computeMode;
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
-    //#if CUDA_VERSION >= 2020
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
     CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
-    //#endif
-    //#if CUDA_VERSION >= 3010
     prop.concurrentKernels = hip_prop.concurrentKernels;
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
-    //#endif
 
     _properties.push_back(prop);
   }
@@ -447,13 +438,11 @@ void UCL_Device::clear() {
 
 // List all devices along with all properties
 void UCL_Device::print_all(std::ostream &out) {
-  //#if CUDA_VERSION >= 2020
   int driver_version;
   hipDriverGetVersion(&driver_version);
   out << "Driver Version: " << driver_version/1000 << "."
       << driver_version%100 << std::endl;
-  //#endif
 
   if (num_devices() == 0)
     out << "There is no device supporting HIP\n";
@@ -470,12 +459,10 @@ void UCL_Device::print_all(std::ostream &out) {
     out << "No\n";
   out << "  Total amount of global memory: "
       << gigabytes(i) << " GB\n";
-  //#if CUDA_VERSION >= 2000
   out << "  Number of compute units/multiprocessors: "
       << _properties[i].multiProcessorCount << std::endl;
   out << "  Number of cores: " << cores(i) << std::endl;
-  //#endif
   out << "  Total amount of constant memory: "
       << _properties[i].totalConstantMemory << " bytes\n";
   out << "  Total amount of local/shared memory per block: "
@@ -494,58 +481,29 @@ void UCL_Device::print_all(std::ostream &out) {
       << _properties[i].maxGridSize[0] << " x " << _properties[i].maxGridSize[1]
       << " x " << _properties[i].maxGridSize[2] << std::endl;
-  //out << "  Maximum memory pitch: "
-  //    << max_pitch(i) << " bytes\n";
-  //out << "  Texture alignment: "
-  //    << _properties[i].textureAlign << " bytes\n";
   out << "  Clock rate: "
       << clock_rate(i) << " GHz\n";
-  //#if CUDA_VERSION >= 2020
-  //out << "  Run time limit on kernels: ";
-  //if (_properties[i].kernelExecTimeoutEnabled)
-  //  out << "Yes\n";
-  //else
-  //  out << "No\n";
   out << "  Integrated: ";
   if (_properties[i].integrated)
     out << "Yes\n";
   else
     out << "No\n";
-  //out << "  Support host page-locked memory mapping: ";
-  //if (_properties[i].canMapHostMemory)
-  //  out << "Yes\n";
-  //else
-  //  out << "No\n";
   out << "  Compute mode: ";
   if (_properties[i].computeMode == hipComputeModeDefault)
     out << "Default\n"; // multiple threads can use device
-//#if CUDA_VERSION >= 8000
-//  else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
-//#else
   else if (_properties[i].computeMode == hipComputeModeExclusive)
-//#endif
     out << "Exclusive\n"; // only thread can use device
   else if (_properties[i].computeMode == hipComputeModeProhibited)
     out << "Prohibited\n"; // no thread can use device
-  //#if CUDART_VERSION >= 4000
   else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
    out << "Exclusive Process\n"; // multiple threads 1 process
-  //#endif
   else
     out << "Unknown\n";
-  //#endif
-  //#if CUDA_VERSION >= 3010
   out << "  Concurrent kernel execution: ";
   if (_properties[i].concurrentKernels)
     out << "Yes\n";
   else
     out << "No\n";
-  //out << "  Device has ECC support enabled: ";
-  //if (_properties[i].ECCEnabled)
-  //  out << "Yes\n";
-  //else
-  //  out << "No\n";
-  //#endif
   }
 }

diff --git a/lib/gpu/geryon/hip_macros.h b/lib/gpu/geryon/hip_macros.h
index 9c9971b896..96313ec87e 100644
--- a/lib/gpu/geryon/hip_macros.h
+++ b/lib/gpu/geryon/hip_macros.h
@@ -5,11 +5,7 @@
 #include
 #include
 
-//#if CUDA_VERSION >= 3020
 #define CUDA_INT_TYPE size_t
-//#else
-//#define CUDA_INT_TYPE unsigned
-//#endif
 
 #ifdef MPI_GERYON
 #include "mpi.h"

diff --git a/lib/gpu/geryon/hip_texture.h b/lib/gpu/geryon/hip_texture.h
index eb27c7a1ed..3e8c56a4e4 100644
--- a/lib/gpu/geryon/hip_texture.h
+++ b/lib/gpu/geryon/hip_texture.h
@@ -71,9 +71,6 @@ class UCL_Texture {
 
   /// Make a texture reference available to kernel
   inline void allow(UCL_Kernel &) {
-    //#if CUDA_VERSION < 4000
-    //CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
-    //#endif
   }
 
  private:

diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index d5963fd39f..80414c9873 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -320,6 +320,9 @@ class UCL_Device {
 
 // Grabs the properties for all devices
 UCL_Device::UCL_Device() {
+#if CUDA_VERSION < 8000
+#error CUDA Toolkit version 8 or later required
+#endif
   CU_SAFE_CALL_NS(cuInit(0));
   CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
   for (int i=0; i<_num_devices; ++i) {
@@ -358,16 +361,12 @@ UCL_Device::UCL_Device() {
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
-    #if CUDA_VERSION >= 2020
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
-    #endif
-    #if CUDA_VERSION >= 3010
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
-    #endif
 
     _properties.push_back(prop);
   }
@@ -415,13 +414,10 @@ void UCL_Device::clear() {
 
 // List all devices along with all properties
 void UCL_Device::print_all(std::ostream &out) {
-  #if CUDA_VERSION >= 2020
   int driver_version;
   cuDriverGetVersion(&driver_version);
   out << "CUDA Driver Version: "
-      << driver_version/1000 << "." << driver_version%100
-      << std::endl;
-  #endif
+      << driver_version/1000 << "." << driver_version%100 << std::endl;
 
   if (num_devices() == 0)
     out << "There is no device supporting CUDA\n";
@@ -438,12 +434,10 @@ void UCL_Device::print_all(std::ostream &out) {
     out << "No\n";
   out << "  Total amount of global memory: "
       << gigabytes(i) << " GB\n";
-  #if CUDA_VERSION >= 2000
   out << "  Number of compute units/multiprocessors: "
       << _properties[i].multiProcessorCount << std::endl;
   out << "  Number of cores: " << cores(i) << std::endl;
-  #endif
   out << "  Total amount of constant memory: "
       << _properties[i].totalConstantMemory << " bytes\n";
   out << "  Total amount of local/shared memory per block: "
@@ -468,7 +462,6 @@ void UCL_Device::print_all(std::ostream &out) {
       << _properties[i].textureAlign << " bytes\n";
   out << "  Clock rate: "
       << clock_rate(i) << " GHz\n";
-  #if CUDA_VERSION >= 2020
   out << "  Run time limit on kernels: ";
   if (_properties[i].kernelExecTimeoutEnabled)
     out << "Yes\n";
@@ -487,22 +480,14 @@ void UCL_Device::print_all(std::ostream &out) {
   out << "  Compute mode: ";
   if (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT)
     out << "Default\n"; // multiple threads can use device
-#if CUDA_VERSION >= 8000
   else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
-#else
-  else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE)
-#endif
     out << "Exclusive\n"; // only thread can use device
   else if (_properties[i].computeMode == CU_COMPUTEMODE_PROHIBITED)
     out << "Prohibited\n"; // no thread can use device
-  #if CUDART_VERSION >= 4000
   else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
     out << "Exclusive Process\n"; // multiple threads 1 process
-  #endif
   else
     out << "Unknown\n";
-  #endif
-  #if CUDA_VERSION >= 3010
   out << "  Concurrent kernel execution: ";
   if (_properties[i].concurrentKernels)
     out << "Yes\n";
@@ -513,7 +498,6 @@ void UCL_Device::print_all(std::ostream &out) {
     out << "Yes\n";
   else
     out << "No\n";
-  #endif
   }
 }

diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h
index 798b12e53c..55ba5a8f47 100644
--- a/lib/gpu/geryon/nvd_kernel.h
+++ b/lib/gpu/geryon/nvd_kernel.h
@@ -165,17 +165,11 @@ class UCL_Program {
 class UCL_Kernel {
  public:
   UCL_Kernel() : _dimensions(1), _num_args(0) {
-    #if CUDA_VERSION < 4000
-    _param_size=0;
-    #endif
    _num_blocks[0]=0;
   }
 
   UCL_Kernel(UCL_Program &program, const char *function) :
     _dimensions(1), _num_args(0) {
-    #if CUDA_VERSION < 4000
-    _param_size=0;
-    #endif
     _num_blocks[0]=0;
     set_function(program,function);
     _cq=program._cq;
@@ -211,11 +205,7 @@ class UCL_Kernel {
     if (index==_num_args)
       add_arg(arg);
     else if (index<_num_args)
-      #if CUDA_VERSION >= 4000
       _kernel_args[index]=arg;
-      #else
-      CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
-      #endif
     else
       assert(0==1); // Must add kernel parameters in sequential order
   }
@@ -242,15 +232,7 @@ class UCL_Kernel {
 
   /// Add a kernel argument.
   inline void add_arg(const CUdeviceptr* const arg) {
-    #if CUDA_VERSION >= 4000
     _kernel_args[_num_args]=(void *)arg;
-    #else
-    void* ptr = (void*)(size_t)(*arg);
-    _param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
-    CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
-    _offsets.push_back(_param_size);
-    _param_size+=sizeof(ptr);
-    #endif
     _num_args++;
     if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
   }
@@ -258,14 +240,7 @@ class UCL_Kernel {
   /// Add a kernel argument.
   template <class dtype>
   inline void add_arg(const dtype* const arg) {
-    #if CUDA_VERSION >= 4000
     _kernel_args[_num_args]=const_cast<dtype *>(arg);
-    #else
-    _param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
-    CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
-    _offsets.push_back(_param_size);
-    _param_size+=sizeof(dtype);
-    #endif
     _num_args++;
     if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
   }
@@ -298,13 +273,9 @@ class UCL_Kernel {
     _num_blocks[0]=num_blocks;
     _num_blocks[1]=1;
     _num_blocks[2]=1;
-    #if CUDA_VERSION >= 4000
     _block_size[0]=block_size;
     _block_size[1]=1;
     _block_size[2]=1;
-    #else
-    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
-    #endif
   }
 
   /// Set the number of thread blocks and the number of threads in each block
@@ -323,13 +294,9 @@ class UCL_Kernel {
     _num_blocks[0]=num_blocks_x;
     _num_blocks[1]=num_blocks_y;
     _num_blocks[2]=1;
-    #if CUDA_VERSION >= 4000
     _block_size[0]=block_size_x;
     _block_size[1]=block_size_y;
     _block_size[2]=1;
-    #else
-    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
-    #endif
   }
 
   /// Set the number of thread blocks and the number of threads in each block
@@ -350,14 +317,9 @@ class UCL_Kernel {
     _num_blocks[0]=num_blocks_x;
     _num_blocks[1]=num_blocks_y;
     _num_blocks[2]=1;
-    #if CUDA_VERSION >= 4000
     _block_size[0]=block_size_x;
     _block_size[1]=block_size_y;
     _block_size[2]=block_size_z;
-    #else
-    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
-                                     block_size_z));
-    #endif
   }
 
   /// Set the number of thread blocks and the number of threads in each block
@@ -373,23 +335,14 @@ class UCL_Kernel {
 
   /// Run the kernel in the default command queue
   inline void run() {
-    #if CUDA_VERSION >= 4000
     CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
                                 _num_blocks[2],_block_size[0],_block_size[1],
                                 _block_size[2],0,_cq,_kernel_args,nullptr));
-    #else
-    CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
-    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
-    #endif
   }
 
   /// Clear any arguments associated with the kernel
   inline void clear_args() {
     _num_args=0;
-    #if CUDA_VERSION < 4000
-    _offsets.clear();
-    _param_size=0;
-    #endif
   }
 
   /// Return the default command queue/stream associated with this data
@@ -406,13 +359,8 @@ class UCL_Kernel {
   unsigned _num_args;
   friend class UCL_Texture;
 
-  #if CUDA_VERSION >= 4000
   unsigned _block_size[3];
   void * _kernel_args[UCL_MAX_KERNEL_ARGS];
-  #else
-  std::vector<unsigned> _offsets;
-  unsigned _param_size;
-  #endif
 };
 
 } // namespace

diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h
index 08ff84991a..5c1f7f25d5 100644
--- a/lib/gpu/geryon/nvd_macros.h
+++ b/lib/gpu/geryon/nvd_macros.h
@@ -5,11 +5,7 @@
 #include
 #include
 
-#if CUDA_VERSION >= 3020
 #define CUDA_INT_TYPE size_t
-#else
-#define CUDA_INT_TYPE unsigned
-#endif
 
 #ifdef MPI_GERYON
 #include "mpi.h"

diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h
index d7d65da903..96e1991859 100644
--- a/lib/gpu/geryon/nvd_texture.h
+++ b/lib/gpu/geryon/nvd_texture.h
@@ -69,9 +69,6 @@ class UCL_Texture {
 
   /// Make a texture reference available to kernel
   inline void allow(UCL_Kernel &kernel) {
-    #if CUDA_VERSION < 4000
-    CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
-    #endif
   }
 
  private:

diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h
index 437631ec3a..99cd640627 100644
--- a/lib/gpu/geryon/ucl_nv_kernel.h
+++ b/lib/gpu/geryon/ucl_nv_kernel.h
@@ -25,21 +25,8 @@
 #ifndef UCL_NV_KERNEL_H
 #define UCL_NV_KERNEL_H
 
-#if (__CUDA_ARCH__ < 200)
-#define mul24 __mul24
-#define MEM_THREADS 16
-#else
 #define mul24(X,Y) (X)*(Y)
 #define MEM_THREADS 32
-#endif
-
-#ifdef CUDA_PRE_THREE
-struct __builtin_align__(16) _double4
-{
-  double x, y, z, w;
-};
-typedef struct _double4 double4;
-#endif
 
 #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
 #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)

diff --git a/lib/gpu/lal_pre_cuda_hip.h b/lib/gpu/lal_pre_cuda_hip.h
index dfb6229bed..47a005b998 100644
--- a/lib/gpu/lal_pre_cuda_hip.h
+++ b/lib/gpu/lal_pre_cuda_hip.h
@@ -58,49 +58,6 @@
 #define MAX_BIO_SHARED_TYPES 128
 #define PPPM_MAX_SPLINE 8
 
-// -------------------------------------------------------------------------
-//                    LEGACY DEVICE CONFIGURATION
-// -------------------------------------------------------------------------
-
-#ifdef __CUDA_ARCH__
-
-#if (__CUDA_ARCH__ < 200)
-
-#undef CONFIG_ID
-#define CONFIG_ID 101
-#define MEM_THREADS 16
-#undef THREADS_PER_ATOM
-#define THREADS_PER_ATOM 1
-#undef THREADS_PER_CHARGE
-#define THREADS_PER_CHARGE 16
-#undef BLOCK_PAIR
-#define BLOCK_PAIR 64
-#undef BLOCK_BIO_PAIR
-#define BLOCK_BIO_PAIR 64
-#undef BLOCK_NBOR_BUILD
-#define BLOCK_NBOR_BUILD 64
-#undef MAX_SHARED_TYPES
-#define MAX_SHARED_TYPES 8
-#undef SHUFFLE_AVAIL
-#define SHUFFLE_AVAIL 0
-
-#elseif (__CUDA_ARCH__ < 300)
-
-#undef CONFIG_ID
-#define CONFIG_ID 102
-#undef BLOCK_PAIR
-#define BLOCK_PAIR 128
-#undef BLOCK_BIO_PAIR
-#define BLOCK_BIO_PAIR 128
-#undef MAX_SHARED_TYPES
-#define MAX_SHARED_TYPES 8
-#undef SHUFFLE_AVAIL
-#define SHUFFLE_AVAIL 0
-
-#endif
-
-#endif
-
 // -------------------------------------------------------------------------
 //                           KERNEL MACROS
 // -------------------------------------------------------------------------
@@ -111,12 +68,6 @@
 
 #define fast_mul(X,Y) (X)*(Y)
 
-#ifdef __CUDA_ARCH__
-#if (__CUDA_ARCH__ < 200)
-#define fast_mul __mul24
-#endif
-#endif
-
 #define EVFLAG 1
 #define NOUNROLL
 #define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x)
@@ -220,14 +171,6 @@
 //                        KERNEL MACROS - MATH
 // -------------------------------------------------------------------------
 
-#ifdef CUDA_PRE_THREE
-struct __builtin_align__(16) _double4
-{
-  double x, y, z, w;
-};
-typedef struct _double4 double4;
-#endif
-
 #ifdef _DOUBLE_DOUBLE
 
 #define ucl_exp exp
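
--
Usage sketch (illustrative, not part of the patch above): how the new
version gate behaves when configuring the GPU package with CMake. The
build directory layout is an assumption; PKG_GPU, GPU_API, and GPU_ARCH
are the LAMMPS CMake options referenced by this patch and its docs.

  # from an empty build directory next to the LAMMPS "cmake" directory
  cmake -D PKG_GPU=on -D GPU_API=cuda -D GPU_ARCH=sm_80 ../cmake

  # with this patch applied, configuration behaves as follows:
  #   CUDA toolkit  < 8.0 : stops with FATAL_ERROR (no longer supported)
  #   CUDA toolkit >= 12.0: warns about the untested toolkit version and
  #                         compiles with "-arch=all" instead of an
  #                         explicit -gencode list
  #   otherwise           : builds a fat binary, appending one -gencode
  #                         entry per architecture (Kepler and newer)
  #                         that the detected toolkit supports, with
  #                         GPU_ARCH compiled directly via --arch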