From e299fa9aab4877f1e6b301b96edc9feec53df552 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 20 Jul 2023 14:00:21 -0400 Subject: [PATCH] add option to skip building for multiple GPU archs --- cmake/Modules/Packages/GPU.cmake | 104 ++++++++++++++++--------------- doc/src/Build_extras.rst | 68 +++++++++++--------- 2 files changed, 93 insertions(+), 79 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 4a70eb7a1e..99321fce9f 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -64,6 +64,8 @@ if(GPU_API STREQUAL "CUDA") endif() set(GPU_CUDA_MPS_FLAGS "-DCUDA_MPS_SUPPORT") endif() + option(CUDA_BUILD_MULTIARCH "Enable building CUDA kernels for all supported GPU architectures" ON) + mark_as_advanced(CUDA_BUILD_MULTIARCH) set(GPU_ARCH "sm_50" CACHE STRING "LAMMPS GPU CUDA SM primary architecture (e.g. sm_60)") @@ -93,56 +95,58 @@ if(GPU_API STREQUAL "CUDA") # --arch translates directly instead of JIT, so this should be for the preferred or most common architecture set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH}") - # apply the following to build "fat" CUDA binaries only for known CUDA toolkits since version 8.0 - # only the Kepler achitecture and beyond is supported - # comparison chart according to: https://en.wikipedia.org/wiki/CUDA#GPUs_supported - if(CUDA_VERSION VERSION_LESS 8.0) - message(FATAL_ERROR "CUDA Toolkit version 8.0 or later is required") - elseif(CUDA_VERSION VERSION_GREATER_EQUAL "13.0") - message(WARNING "Untested CUDA Toolkit version ${CUDA_VERSION}. 
Use at your own risk") - set(GPU_CUDA_GENCODE "-arch=all") - elseif(CUDA_VERSION VERSION_GREATER_EQUAL "12.0") - set(GPU_CUDA_GENCODE "-arch=all") - else() - # Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2 - if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0")) - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30] ") - endif() - # Kepler (GPU Arch 3.5) is supported by CUDA 5 to CUDA 11 - if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "12.0")) - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_35,code=[sm_35,compute_35]") - endif() - # Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later - if(CUDA_VERSION VERSION_GREATER_EQUAL "6.0") - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52]") - endif() - # Pascal (GPU Arch 6.x) is supported by CUDA 8 and later - if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0") - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61]") - endif() - # Volta (GPU Arch 7.0) is supported by CUDA 9 and later - if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0") - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_70,code=[sm_70,compute_70]") - endif() - # Turing (GPU Arch 7.5) is supported by CUDA 10 and later - if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0") - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_75,code=[sm_75,compute_75]") - endif() - # Ampere (GPU Arch 8.0) is supported by CUDA 11 and later - if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0") - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_80,code=[sm_80,compute_80]") - endif() - # Ampere (GPU Arch 8.6) is supported by CUDA 11.1 and later - if(CUDA_VERSION VERSION_GREATER_EQUAL "11.1") - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_86,code=[sm_86,compute_86]") - endif() - # Lovelace (GPU Arch 8.9) is 
supported by CUDA 11.8 and later - if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8") - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_90,code=[sm_90,compute_90]") - endif() - # Hopper (GPU Arch 9.0) is supported by CUDA 12.0 and later - if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0") - string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_90,code=[sm_90,compute_90]") + if(CUDA_BUILD_MULTIARCH) + # apply the following to build "fat" CUDA binaries only for known CUDA toolkits since version 8.0 + # only the Kepler architecture and beyond is supported + # comparison chart according to: https://en.wikipedia.org/wiki/CUDA#GPUs_supported + if(CUDA_VERSION VERSION_LESS 8.0) + message(FATAL_ERROR "CUDA Toolkit version 8.0 or later is required") + elseif(CUDA_VERSION VERSION_GREATER_EQUAL "13.0") + message(WARNING "Untested CUDA Toolkit version ${CUDA_VERSION}. Use at your own risk") + set(GPU_CUDA_GENCODE "-arch=all") + elseif(CUDA_VERSION VERSION_GREATER_EQUAL "12.0") + set(GPU_CUDA_GENCODE "-arch=all") + else() + # Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2 + if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0")) + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30] ") + endif() + # Kepler (GPU Arch 3.5) is supported by CUDA 5 to CUDA 11 + if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "12.0")) + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_35,code=[sm_35,compute_35]") + endif() + # Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later + if(CUDA_VERSION VERSION_GREATER_EQUAL "6.0") + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52]") + endif() + # Pascal (GPU Arch 6.x) is supported by CUDA 8 and later + if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0") + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_60,code=[sm_60,compute_60] -gencode 
arch=compute_61,code=[sm_61,compute_61]") + endif() + # Volta (GPU Arch 7.0) is supported by CUDA 9 and later + if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0") + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_70,code=[sm_70,compute_70]") + endif() + # Turing (GPU Arch 7.5) is supported by CUDA 10 and later + if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0") + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_75,code=[sm_75,compute_75]") + endif() + # Ampere (GPU Arch 8.0) is supported by CUDA 11 and later + if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0") + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_80,code=[sm_80,compute_80]") + endif() + # Ampere (GPU Arch 8.6) is supported by CUDA 11.1 and later + if(CUDA_VERSION VERSION_GREATER_EQUAL "11.1") + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_86,code=[sm_86,compute_86]") + endif() + # Lovelace (GPU Arch 8.9) is supported by CUDA 11.8 and later + if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8") + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_89,code=[sm_89,compute_89]") + endif() + # Hopper (GPU Arch 9.0) is supported by CUDA 12.0 and later + if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0") + string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_90,code=[sm_90,compute_90]") + endif() endif() endif() diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 0ecf54f744..692ab52e1d 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -140,6 +140,8 @@ CMake build # value = yes or no (default) -D CUDA_MPS_SUPPORT=value # enables some tweaks required to run with active nvidia-cuda-mps daemon # value = yes or no (default) + -D CUDA_BUILD_MULTIARCH=value # enables building CUDA kernels for all supported GPU architectures + # value = yes (default) or no -D USE_STATIC_OPENCL_LOADER=value # downloads/includes OpenCL ICD loader library, no local OpenCL headers/libs needed # value = yes (default) or no @@ -158,41 +160,49 @@ CMake build A more detailed list can be found, 
for example, at `Wikipedia's CUDA article `_ -CMake can detect which version of the CUDA toolkit is used and thus will try -to include support for **all** major GPU architectures supported by this toolkit. -Thus the GPU_ARCH setting is merely an optimization, to have code for -the preferred GPU architecture directly included rather than having to wait -for the JIT compiler of the CUDA driver to translate it. +CMake can detect which version of the CUDA toolkit is used and thus will +try to include support for **all** major GPU architectures supported by +this toolkit. Thus the GPU_ARCH setting is merely an optimization, to +have code for the preferred GPU architecture directly included rather +than having to wait for the JIT compiler of the CUDA driver to translate +it. This behavior can be turned off (e.g. to speed up compilation) by +setting :code:`CUDA_BUILD_MULTIARCH` to :code:`no`. -When compiling for CUDA or HIP with CUDA, version 8.0 or later of the CUDA toolkit -is required and a GPU architecture of Kepler or later, which must *also* be -supported by the CUDA toolkit in use **and** the CUDA driver in use. -When compiling for OpenCL, OpenCL version 1.2 or later is required and the -GPU must be supported by the GPU driver and OpenCL runtime bundled with the driver. +When compiling for CUDA or HIP with CUDA, version 8.0 or later of the +CUDA toolkit is required and a GPU architecture of Kepler or later, +which must *also* be supported by the CUDA toolkit in use **and** the +CUDA driver in use. When compiling for OpenCL, OpenCL version 1.2 or +later is required and the GPU must be supported by the GPU driver and +OpenCL runtime bundled with the driver. -When building with CMake, you **must NOT** build the GPU library in ``lib/gpu`` -using the traditional build procedure. CMake will detect files generated by that -process and will terminate with an error and a suggestion for how to remove them. 
+When building with CMake, you **must NOT** build the GPU library in +``lib/gpu`` using the traditional build procedure. CMake will detect +files generated by that process and will terminate with an error and a +suggestion for how to remove them. -If you are compiling for OpenCL, the default setting is to download, build, and -link with a static OpenCL ICD loader library and standard OpenCL headers. This -way no local OpenCL development headers or library needs to be present and only -OpenCL compatible drivers need to be installed to use OpenCL. If this is not -desired, you can set :code:`USE_STATIC_OPENCL_LOADER` to :code:`no`. +If you are compiling for OpenCL, the default setting is to download, +build, and link with a static OpenCL ICD loader library and standard +OpenCL headers. This way no local OpenCL development headers or library +needs to be present and only OpenCL compatible drivers need to be +installed to use OpenCL. If this is not desired, you can set +:code:`USE_STATIC_OPENCL_LOADER` to :code:`no`. -The GPU library has some multi-thread support using OpenMP. If LAMMPS is built -with ``-D BUILD_OMP=on`` this will also be enabled. +The GPU library has some multi-thread support using OpenMP. If LAMMPS +is built with ``-D BUILD_OMP=on`` this will also be enabled. -If you are compiling with HIP, note that before running CMake you will have to -set appropriate environment variables. Some variables such as -:code:`HCC_AMDGPU_TARGET` (for ROCm <= 4.0) or :code:`CUDA_PATH` are necessary for :code:`hipcc` -and the linker to work correctly. +If you are compiling with HIP, note that before running CMake you will +have to set appropriate environment variables. Some variables such as +:code:`HCC_AMDGPU_TARGET` (for ROCm <= 4.0) or :code:`CUDA_PATH` are +necessary for :code:`hipcc` and the linker to work correctly. -Using CHIP-SPV implementation of HIP is now supported. It allows one to run HIP -code on Intel GPUs via the OpenCL or Level Zero backends. 
To use CHIP-SPV, you must -set :code:`-DHIP_USE_DEVICE_SORT=OFF` in your CMake command line as CHIP-SPV does not -yet support hipCUB. The use of HIP for Intel GPUs is still experimental so you -should only use this option in preparations to run on Aurora system at ANL. +.. versionadded:: 3Aug2022 + +Using the CHIP-SPV implementation of HIP is supported. It allows one to +run HIP code on Intel GPUs via the OpenCL or Level Zero backends. To use +CHIP-SPV, you must set :code:`-DHIP_USE_DEVICE_SORT=OFF` in your CMake +command line as CHIP-SPV does not yet support hipCUB. As of Summer 2022, +the use of HIP for Intel GPUs is experimental. You should only use this +option in preparation for running on the Aurora system at Argonne. .. code:: bash