Whack extra files from #3532

Stan Moore
2023-01-26 15:12:34 -07:00
parent 866cf8ae60
commit b7dfa3db05
5 changed files with 0 additions and 1035 deletions

@@ -1,199 +0,0 @@
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 22af411f32..530510a0d1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+ endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 88ddc48378..b83ced9243 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -102,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index f56cef1651..2585a6a64c 100644
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index ef16aad047..c1d76cceeb 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()
+IF (KOKKOS_ARCH_HOPPER90)
+ SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)
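The build-system changes above only introduce the KOKKOS_ARCH_HOPPER90 (and KOKKOS_ARCH_HOPPER) defines. A minimal sketch, not part of the patch, of application code keying off the new macro once Kokkos is configured with KOKKOS_ARCH=Hopper90 or -DKokkos_ARCH_HOPPER90=ON:

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
#if defined(KOKKOS_ARCH_HOPPER90)
  // KokkosCore_config.h defined the macro, so this build targets sm_90.
  std::printf("Kokkos was built for NVIDIA Hopper (compute capability 9.0)\n");
#else
  std::printf("Kokkos was built without Hopper (sm_90) support\n");
#endif
  Kokkos::finalize();
  return 0;
}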
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 56f9117844..fcd4773dbc 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
case 61: return 96;
case 70:
case 80:
- case 86: return 8;
+ case 86:
+ case 90: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index 40a263561f..8c40ebd60d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index f9451ecfe6..2ce1efb98c 100644
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_AMPERE)
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index a9bc085912..27ee1d4232 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 840db4327c..7e5addbc5b 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
- !defined(KOKKOS_ARCH_AMPERE)
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 5ac7d8af30..ba101f699e 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

@@ -1,523 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index fcd4773dbc..30b6958a67 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these number can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
return 0;
}() * 1024;
}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 5811498e01..e22eb3b842 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84f..5c4c3a7d39 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}
-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
+
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
- bool prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+ const KernelFuncPtr& func) {
+ // Only call cudaFuncGetAttributes once for each unique kernel
+ // by leveraging static variable initialization rules
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+ cudaFuncAttributes attr;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+ return attr;
+ };
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
+ return func_attr;
+}
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+ const cudaDeviceProp& device_props,
+ const size_t block_size, int& shmem,
+ const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
- // On Kepler the L1 has no benefit since it doesn't cache reads
+
+ const auto& func_attr =
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+ // Compute limits for number of blocks due to registers/SM
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
+ const size_t regs_per_thread = func_attr.numRegs;
+ // The granularity of register allocation is chunks of 256 registers per warp
+ // -> 8 registers per thread
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+ const size_t max_blocks_regs =
+ regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute how many threads per sm we actually want
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+ // only allocate multiples of warp size
+ const size_t num_threads_desired =
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+ // Get close to the desired occupancy,
+ // don't undershoot by much but also don't allocate a whole new block just
+ // because one is a few threads over otherwise.
+ size_t num_blocks_desired =
+ (num_threads_desired + block_size * 0.8) / block_size;
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+ // Calculate how much shared memory we need per block
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
+ // If we want to lower occupancy we have to make sure we request at least that
+ // much in aggregate over all blocks, so that shared memory actually becomes a
+ // limiting factor for occupancy
+ constexpr size_t min_shmem_size_per_sm = 8192;
+ if ((occupancy < 100) &&
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+ // Need to set the caller's shmem variable so that the
+ // kernel launch uses the correct dynamic shared memory request
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
+ }
+
+ // Compute the carveout fraction we need based on occupancy
+ // Use multiples of 8kB
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+ size_t carveout = shmem_per_block == 0
+ ? 0
+ : 100 *
+ (((num_blocks_desired * shmem_per_block +
+ min_shmem_size_per_sm - 1) /
+ min_shmem_size_per_sm) *
+ min_shmem_size_per_sm) /
+ max_shmem_per_sm;
+ if (carveout > 100) carveout = 100;
+
+ // Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
- func,
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
- return prefer_shmem;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+ return carveout;
};
- static bool cache_config_preference_cached = set_cache_config();
- if (cache_config_preference_cached != prefer_shmem) {
+ // Store the value in a static variable so we only reset if needed
+ static size_t cache_config_preference_cached = set_cache_config();
+ if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
- (void)prefer_shmem;
+ (void)device_props;
+ (void)block_size;
+ (void)occupancy;
#endif
}
-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const& policy, cudaDeviceProp const& properties,
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
- bool& prefer_shmem) {
- int const block_size = block.x * block.y * block.z;
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
- size_t const static_shmem = attributes.sharedSizeBytes;
-
- // round to nearest integer and avoid division by zero
- int active_blocks = std::max(
- 1, static_cast<int>(std::round(
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
- block_size * desired_occupancy / 100)));
- int const dynamic_shmem =
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
- if (dynamic_shmem > shmem) {
- shmem = dynamic_shmem;
- prefer_shmem = false;
- }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
void const* args[] = {&driver};
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if constexpr (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
+ driver, grid, block, shmem, cuda_instance);
}
#endif
};
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
- const CudaInternal* cuda_instance,
- bool prefer_shmem) {
+ const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem);
- // If a desired occupancy is specified, we compute how much shared memory
- // to ask for to achieve that occupancy, assuming that the cache
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
- // shared memory computed is actually smaller than `shmem` we overwrite
- // `shmem` and set `prefer_shmem` to `false`.
- modify_launch_configuration_if_desired_occupancy_is_specified(
- driver.get_policy(), cuda_instance->m_deviceProp,
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
- Impl::configure_shmem_preference<
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<
+ DriverType,
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
}
static cudaFuncAttributes get_cuda_func_attributes() {
- // Race condition inside of cudaFuncGetAttributes if the same address is
- // given requires using a local variable as input instead of a static Rely
- // on static variable initialization to make sure only one thread executes
- // the code and the result is visible.
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
- cudaFuncAttributes attr_tmp;
- KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
- return attr_tmp;
- };
- static cudaFuncAttributes attr = wrap_get_attributes();
- return attr;
+ return get_cuda_kernel_func_attributes<
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func());
}
};
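For orientation, a plain-number walk-through of the carveout arithmetic in the configure_shmem_preference added above; this is a standalone sketch, and the device and kernel figures are assumptions rather than values queried from a real GPU:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed device properties (roughly an A100-class part).
  const std::size_t regs_per_sm        = 65536;
  const std::size_t max_threads_per_sm = 2048;
  const std::size_t max_shmem_per_sm   = 164 * 1024;
  // Assumed kernel properties and launch configuration.
  const std::size_t regs_per_thread = 42;
  const std::size_t block_size      = 256;
  std::size_t shmem_per_block       = 4096;  // dynamic + static shared memory
  const std::size_t occupancy       = 50;    // requested occupancy in percent

  // Registers are allocated in chunks of 256 per warp -> 8 per thread.
  const std::size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 7) / 8);
  const std::size_t max_blocks_regs =
      regs_per_sm / (allocated_regs_per_thread * block_size);

  // Threads (rounded up to whole warps) and blocks needed for the requested
  // occupancy, without undershooting by much.
  const std::size_t num_threads_desired =
      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
  std::size_t num_blocks_desired =
      (num_threads_desired + block_size * 8 / 10) / block_size;
  num_blocks_desired = std::min(max_blocks_regs, num_blocks_desired);
  if (num_blocks_desired == 0) num_blocks_desired = 1;

  // Shared memory only limits occupancy once at least 8 kB per SM is in use.
  constexpr std::size_t min_shmem_size_per_sm = 8192;
  if (occupancy < 100 &&
      shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)
    shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;

  // Carveout as a percentage of the SM's shared memory, in 8 kB steps.
  std::size_t carveout =
      shmem_per_block == 0
          ? 0
          : 100 *
                (((num_blocks_desired * shmem_per_block +
                   min_shmem_size_per_sm - 1) /
                  min_shmem_size_per_sm) *
                 min_shmem_size_per_sm) /
                max_shmem_per_sm;
  if (carveout > 100) carveout = 100;

  std::printf("blocks/SM = %zu, shmem/block = %zu B, carveout = %zu%%\n",
              num_blocks_desired, shmem_per_block, carveout);
  return 0;
}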
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index e586bb4cc6..0e348c092a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index ac160f8fe2..d1031751c2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
}
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdd16085b3..ea9430b812 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
m_policy.space().fence(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 34d4bef9fd..178012431c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fb3a6b138f..a12378a891 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>(
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
- false);
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
}
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

@@ -1,46 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 30b6958a67..b94f053272 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-// "Compute Capability" first and check what is the smallest "Shared Memory
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-// bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-// the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
- int const compute_capability = properties.major * 10 + properties.minor;
- return [compute_capability]() {
- switch (compute_capability) {
- case 30:
- case 32:
- case 35: return 16;
- case 37: return 80;
- case 50:
- case 53:
- case 60:
- case 62: return 64;
- case 52:
- case 61: return 96;
- case 70:
- case 80:
- case 86:
- case 90: return 8;
- case 75: return 32;
- default:
- Kokkos::Impl::throw_runtime_exception(
- "Unknown device in cuda block size deduction");
- }
- return 0;
- }() * 1024;
-}
-
} // namespace Impl
} // namespace Kokkos

@@ -1,204 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index b94f053272..252c13c524 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -53,17 +53,69 @@
namespace Kokkos {
namespace Impl {
+inline int cuda_warp_per_sm_allocation_granularity(
+ cudaDeviceProp const& properties) {
+ // Allocation granularity of warps in each sm
+ switch (properties.major) {
+ case 3:
+ case 5:
+ case 7:
+ case 8:
+ case 9: return 4;
+ case 6: return (properties.minor == 0 ? 2 : 4);
+ default:
+ throw_runtime_exception(
+ "Unknown device in cuda warp per sm allocation granularity");
+ return 0;
+ }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+ // Maximum number of warps per sm as a function of register counts,
+ // subject to the constraint that warps are allocated with a fixed granularity
+ int const max_regs_per_block = properties.regsPerBlock;
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
+ int const warp_granularity =
+ cuda_warp_per_sm_allocation_granularity(properties);
+ // The granularity of register allocation is chunks of 256 registers per warp,
+ // which implies a need to over-allocate, so we round up
+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
+
+ // The maximum number of warps per SM is constrained from above by register
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
+ // finite granularity, we need to round down.
+ int const max_warps_per_sm =
+ warp_granularity *
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+ return max_warps_per_sm;
+}
+
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
- regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ int const max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(properties, attributes);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / properties.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+ LaunchBounds) {
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
+ // memory
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+ LaunchBounds{});
+}
+
} // namespace Impl
} // namespace Kokkos
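A standalone sketch of the register-limited warp bound computed by cuda_max_warps_per_sm_registers above, on assumed device and kernel numbers; the per-warp allocation is rounded up to whole 256-register chunks, following the granularity described in the comments:

#include <cstdio>

int main() {
  // Assumed device properties (compute capability 8.x class).
  const int warp_size        = 32;
  const int regs_per_block   = 65536;  // properties.regsPerBlock
  const int warp_granularity = 4;      // per cuda_warp_per_sm_allocation_granularity
  // Assumed kernel property.
  const int regs_per_thread = 96;      // attributes.numRegs

  // Round the per-warp register demand up to whole 256-register chunks
  // (assumption: registers are handed to warps in 256-register units).
  const int regs_per_warp           = regs_per_thread * warp_size;            // 3072
  const int allocated_regs_per_warp = 256 * ((regs_per_warp + 255) / 256);    // 3072

  // Round the register-limited warp count down to the warp allocation
  // granularity, as the function above does.
  const int max_warps_per_sm =
      warp_granularity *
      (regs_per_block / (allocated_regs_per_warp * warp_granularity));

  std::printf("register-limited warps per SM: %d\n", max_warps_per_sm);  // 20
  return 0;
}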
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5c4c3a7d39..170183ca0a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ const size_t max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
+
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0e348c092a..7e4f62f12e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {
+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+ cudaFuncAttributes attr =
+ CudaParallelLaunch<ParallelType,
+ LaunchBounds>::get_cuda_func_attributes();
+ auto const& prop = pol.space().cuda_device_prop();
+
+ // Limits due to registers/SM, MDRange doesn't have
+ // shared memory constraints
+ int const optimal_block_size =
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+ // Compute how many blocks of this size we can launch, based on warp
+ // constraints
+ int const max_warps_per_sm_registers =
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+ int const max_num_threads_from_warps =
+ max_warps_per_sm_registers * prop.warpSize;
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+ // Compute the total number of threads
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+ return std::min(
+ max_threads_per_sm,
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelFor,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelReduce,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {
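The tile-size bound in max_tile_size_product_helper above reduces to a few integer operations; a standalone sketch on assumed numbers (the 1024 cap standing in for CudaTraits::MaxHierarchicalParallelism is also an assumption here):

#include <algorithm>
#include <cstdio>

int main() {
  // Assumed inputs to the helper.
  const int warp_size                    = 32;
  const int max_warps_per_sm_registers   = 40;    // from register limits
  const int optimal_block_size           = 256;   // from block-size deduction
  const int max_hierarchical_parallelism = 1024;  // CudaTraits value (assumed)

  // How many optimally sized blocks fit under the register-limited warp count.
  const int max_num_threads_from_warps = max_warps_per_sm_registers * warp_size;   // 1280
  const int max_num_blocks     = max_num_threads_from_warps / optimal_block_size;  // 5
  const int max_threads_per_sm = optimal_block_size * max_num_blocks;              // 1280

  // The MDRange tile-size product is capped by both quantities.
  std::printf("max tile size product: %d\n",
              std::min(max_threads_per_sm, max_hierarchical_parallelism));  // 1024
  return 0;
}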

@@ -1,63 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 170183ca0a..ba43e362bb 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
+ /*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
void const* args[] = {&driver};
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- if constexpr (DriverType::Policy::
+ if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
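The experimental_contains_desired_occupancy branches above only trigger when a policy carries an occupancy hint. A minimal sketch of such a request via Kokkos::Experimental::prefer and DesiredOccupancy (an API available in recent Kokkos releases; treat the exact spelling as an assumption):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Ask for roughly 50% occupancy instead of the default heuristic.
    auto policy = Kokkos::Experimental::prefer(
        Kokkos::RangePolicy<>(0, 1 << 20),
        Kokkos::Experimental::DesiredOccupancy(50));
    Kokkos::parallel_for(
        "occupancy_hint_example", policy,
        KOKKOS_LAMBDA(const int i) { (void)i; });
  }
  Kokkos::finalize();
  return 0;
}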