diff --git a/lib/kokkos/kokkos_5538.diff b/lib/kokkos/kokkos_5538.diff deleted file mode 100644 index 6bf2ccf6a4..0000000000 --- a/lib/kokkos/kokkos_5538.diff +++ /dev/null @@ -1,199 +0,0 @@ -diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos -index 22af411f32..530510a0d1 100644 ---- a/lib/kokkos/Makefile.kokkos -+++ b/lib/kokkos/Makefile.kokkos -@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP" - #KOKKOS_DEVICES ?= "Threads" - # Options: - # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR --# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 -+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90 - # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX - # IBM: BGQ,Power7,Power8,Power9 - # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A -@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt - KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) - KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) - KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) -+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90) - KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ -@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE - + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ - + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ - + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ -- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) -+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \ -+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90)) - - #SEK: This seems like a bug to me - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) -@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") - KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 - endif -+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) -+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") -+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") -+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 -+ endif - - ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) - KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) -diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in -index 88ddc48378..b83ced9243 100644 ---- a/lib/kokkos/cmake/KokkosCore_config.h.in -+++ b/lib/kokkos/cmake/KokkosCore_config.h.in -@@ -102,6 +102,7 @@ - #cmakedefine KOKKOS_ARCH_AMPERE - #cmakedefine KOKKOS_ARCH_AMPERE80 - #cmakedefine KOKKOS_ARCH_AMPERE86 -+#cmakedefine KOKKOS_ARCH_HOPPER90 - #cmakedefine KOKKOS_ARCH_AMD_ZEN - #cmakedefine KOKKOS_ARCH_AMD_ZEN2 - #cmakedefine KOKKOS_ARCH_AMD_ZEN3 -diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc -index f56cef1651..2585a6a64c 100644 ---- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc -+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc -@@ -74,6 +74,7 @@ int main() { - case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; - case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; - case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; -+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break; - default: - std::cout << "Compute capability " << compute_capability - << " is not supported" << std::endl; -diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake -index ef16aad047..c1d76cceeb 100644 ---- a/lib/kokkos/cmake/kokkos_arch.cmake -+++ b/lib/kokkos/cmake/kokkos_arch.cmake -@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK - KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") - KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") - KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") -+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") - - IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS) - SET(KOKKOS_SHOW_HIP_ARCHS ON) -@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72) - CHECK_CUDA_ARCH(TURING75 sm_75) - CHECK_CUDA_ARCH(AMPERE80 sm_80) - CHECK_CUDA_ARCH(AMPERE86 sm_86) -+CHECK_CUDA_ARCH(HOPPER90 sm_90) - - SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") - FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) -@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) - SET(KOKKOS_ARCH_AMPERE ON) - ENDIF() - -+IF (KOKKOS_ARCH_HOPPER90) -+ SET(KOKKOS_ARCH_HOPPER ON) -+ENDIF() -+ - #Regardless of version, make sure we define the general architecture name - IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A) - SET(KOKKOS_ARCH_VEGA ON) -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index 56f9117844..fcd4773dbc 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { - case 61: return 96; - case 70: - case 80: -- case 86: return 8; -+ case 86: -+ case 90: return 8; - case 75: return 32; - default: - Kokkos::Impl::throw_runtime_exception( -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp -index 40a263561f..8c40ebd60d 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp -@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION - #endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010 - - #if CUDA_VERSION >= 11010 && \ -- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86))) -+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER))) - KOKKOS_INLINE_FUNCTION - bhalf_t cast_to_bhalf(bhalf_t val) { return val; } - KOKKOS_INLINE_FUNCTION -diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp -index f9451ecfe6..2ce1efb98c 100644 ---- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp -+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp -@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl { - - struct OpenACC_Traits { - #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_AMPERE) -+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER) - static constexpr acc_device_t dev_type = acc_device_nvidia; - static constexpr bool may_fallback_to_host = false; - #else -diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp -index a9bc085912..27ee1d4232 100644 ---- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp -+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp -@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() { - - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures - // from Pascal and upwards. --#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) -+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ -+ defined(KOKKOS_ARCH_HOPPER) - #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) - omp_set_num_teams(512); - #endif -diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp -index 840db4327c..7e5addbc5b 100644 ---- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp -+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp -@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { - #if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \ - !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \ - !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \ -- !defined(KOKKOS_ARCH_AMPERE) -+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER) - if (!settings.has_device_id() && gpu_devices.empty()) { - Impl::SYCLInternal::singleton().initialize(sycl::device()); - return; -diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp -index 5ac7d8af30..ba101f699e 100644 ---- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp -+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp -@@ -335,9 +335,10 @@ class TeamPolicyInternal - return std::min({ - int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), - // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. --#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) -+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ -+ defined(KOKKOS_ARCH_HOPPER) - 256, - #endif - max_threads_for_memory -@@ -367,9 +368,10 @@ class TeamPolicyInternal - return std::min({ - int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), - // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. --#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) -+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ -+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ -+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ -+ defined(KOKKOS_ARCH_HOPPER) - 256, - #endif - max_threads_for_memory diff --git a/lib/kokkos/kokkos_5706.diff b/lib/kokkos/kokkos_5706.diff deleted file mode 100644 index 2bfbb35b06..0000000000 --- a/lib/kokkos/kokkos_5706.diff +++ /dev/null @@ -1,523 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index fcd4773dbc..30b6958a67 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, - LaunchBounds{}); - } - --// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1) - // NOTE these number can be obtained several ways: - // * One option is to download the CUDA Occupancy Calculator spreadsheet, select - // "Compute Capability" first and check what is the smallest "Shared Memory -@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { - return 0; - }() * 1024; - } -+ - } // namespace Impl - } // namespace Kokkos - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp -index 5811498e01..e22eb3b842 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp -@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default - } - #endif - --#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API -- cudaThreadSetCacheConfig(cudaFuncCachePreferShared); --#else -- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); --#endif -- - // Init the array for used for arbitrarily sized atomics - if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays(); - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -index b7a80ad84f..5c4c3a7d39 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -@@ -93,10 +93,6 @@ namespace Impl { - // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) - // function qualifier which could be used to improve performance. - //---------------------------------------------------------------------------- --// Maximize L1 cache and minimize shared memory: --// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 ); --// For 2.0 capability: 48 KB L1 and 16 KB shared --//---------------------------------------------------------------------------- - - template - __global__ static void cuda_parallel_launch_constant_memory() { -@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { - } - } - --// This function needs to be template on DriverType and LaunchBounds -+// These functions needs to be template on DriverType and LaunchBounds - // so that the static bool is unique for each type combo - // KernelFuncPtr does not necessarily contain that type information. -+ - template --inline void configure_shmem_preference(KernelFuncPtr const& func, -- bool prefer_shmem) { -+const cudaFuncAttributes& get_cuda_kernel_func_attributes( -+ const KernelFuncPtr& func) { -+ // Only call cudaFuncGetAttributes once for each unique kernel -+ // by leveraging static variable initialization rules -+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes { -+ cudaFuncAttributes attr; -+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); -+ return attr; -+ }; -+ static cudaFuncAttributes func_attr = wrap_get_attributes(); -+ return func_attr; -+} -+ -+template -+inline void configure_shmem_preference(const KernelFuncPtr& func, -+ const cudaDeviceProp& device_props, -+ const size_t block_size, int& shmem, -+ const size_t occupancy) { - #ifndef KOKKOS_ARCH_KEPLER -- // On Kepler the L1 has no benefit since it doesn't cache reads -+ -+ const auto& func_attr = -+ get_cuda_kernel_func_attributes(func); -+ -+ // Compute limits for number of blocks due to registers/SM -+ const size_t regs_per_sm = device_props.regsPerMultiprocessor; -+ const size_t regs_per_thread = func_attr.numRegs; -+ // The granularity of register allocation is chunks of 256 registers per warp -+ // -> 8 registers per thread -+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); -+ const size_t max_blocks_regs = -+ regs_per_sm / (allocated_regs_per_thread * block_size); -+ -+ // Compute how many threads per sm we actually want -+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; -+ // only allocate multiples of warp size -+ const size_t num_threads_desired = -+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32; -+ // Get close to the desired occupancy, -+ // don't undershoot by much but also don't allocate a whole new block just -+ // because one is a few threads over otherwise. -+ size_t num_blocks_desired = -+ (num_threads_desired + block_size * 0.8) / block_size; -+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired); -+ if (num_blocks_desired == 0) num_blocks_desired = 1; -+ -+ // Calculate how much shared memory we need per block -+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes; -+ -+ // The minimum shared memory allocation we can have in total per SM is 8kB. -+ // If we want to lower occupancy we have to make sure we request at least that -+ // much in aggregate over all blocks, so that shared memory actually becomes a -+ // limiting factor for occupancy -+ constexpr size_t min_shmem_size_per_sm = 8192; -+ if ((occupancy < 100) && -+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) { -+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired; -+ // Need to set the caller's shmem variable so that the -+ // kernel launch uses the correct dynamic shared memory request -+ shmem = shmem_per_block - func_attr.sharedSizeBytes; -+ } -+ -+ // Compute the carveout fraction we need based on occupancy -+ // Use multiples of 8kB -+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; -+ size_t carveout = shmem_per_block == 0 -+ ? 0 -+ : 100 * -+ (((num_blocks_desired * shmem_per_block + -+ min_shmem_size_per_sm - 1) / -+ min_shmem_size_per_sm) * -+ min_shmem_size_per_sm) / -+ max_shmem_per_sm; -+ if (carveout > 100) carveout = 100; -+ -+ // Set the carveout, but only call it once per kernel or when it changes - auto set_cache_config = [&] { -- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig( -- func, -- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1))); -- return prefer_shmem; -+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute( -+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)); -+ return carveout; - }; -- static bool cache_config_preference_cached = set_cache_config(); -- if (cache_config_preference_cached != prefer_shmem) { -+ // Store the value in a static variable so we only reset if needed -+ static size_t cache_config_preference_cached = set_cache_config(); -+ if (cache_config_preference_cached != carveout) { - cache_config_preference_cached = set_cache_config(); - } - #else - // Use the parameters so we don't get a warning - (void)func; -- (void)prefer_shmem; -+ (void)device_props; -+ (void)block_size; -+ (void)occupancy; - #endif - } - --template --std::enable_if_t --modify_launch_configuration_if_desired_occupancy_is_specified( -- Policy const& policy, cudaDeviceProp const& properties, -- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem, -- bool& prefer_shmem) { -- int const block_size = block.x * block.y * block.z; -- int const desired_occupancy = policy.impl_get_desired_occupancy().value(); -- -- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties); -- size_t const static_shmem = attributes.sharedSizeBytes; -- -- // round to nearest integer and avoid division by zero -- int active_blocks = std::max( -- 1, static_cast(std::round( -- static_cast(properties.maxThreadsPerMultiProcessor) / -- block_size * desired_occupancy / 100))); -- int const dynamic_shmem = -- shmem_per_sm_prefer_l1 / active_blocks - static_shmem; -- -- if (dynamic_shmem > shmem) { -- shmem = dynamic_shmem; -- prefer_shmem = false; -- } --} -- --template --std::enable_if_t --modify_launch_configuration_if_desired_occupancy_is_specified( -- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&, -- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {} -- - // end Some helper functions for launch code readability }}}1 - //============================================================================== - -@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker< - #ifdef KOKKOS_CUDA_ENABLE_GRAPHS - inline static void create_parallel_launch_graph_node( - DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, -- CudaInternal const* cuda_instance, bool prefer_shmem) { -+ CudaInternal const* cuda_instance) { - //---------------------------------------- - auto const& graph = Impl::get_cuda_graph_from_kernel(driver); - KOKKOS_EXPECTS(bool(graph)); -@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker< - - if (!Impl::is_empty_launch(grid, block)) { - Impl::check_shmem_request(cuda_instance, shmem); -- Impl::configure_shmem_preference( -- base_t::get_kernel_func(), prefer_shmem); -+ if (DriverType::Policy:: -+ experimental_contains_desired_occupancy) { -+ int desired_occupancy = -+ driver.get_policy().impl_get_desired_occupancy().value(); -+ size_t block_size = block.x * block.y * block.z; -+ Impl::configure_shmem_preference( -+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -+ shmem, desired_occupancy); -+ } - - void const* args[] = {&driver}; - -@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker< - #ifdef KOKKOS_CUDA_ENABLE_GRAPHS - inline static void create_parallel_launch_graph_node( - DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, -- CudaInternal const* cuda_instance, bool prefer_shmem) { -+ CudaInternal const* cuda_instance) { - //---------------------------------------- - auto const& graph = Impl::get_cuda_graph_from_kernel(driver); - KOKKOS_EXPECTS(bool(graph)); -@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker< - - if (!Impl::is_empty_launch(grid, block)) { - Impl::check_shmem_request(cuda_instance, shmem); -- Impl::configure_shmem_preference( -- base_t::get_kernel_func(), prefer_shmem); -+ if constexpr (DriverType::Policy:: -+ experimental_contains_desired_occupancy) { -+ int desired_occupancy = -+ driver.get_policy().impl_get_desired_occupancy().value(); -+ size_t block_size = block.x * block.y * block.z; -+ Impl::configure_shmem_preference( -+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -+ shmem, desired_occupancy); -+ } - - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); - -@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker< - #ifdef KOKKOS_CUDA_ENABLE_GRAPHS - inline static void create_parallel_launch_graph_node( - DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, -- CudaInternal const* cuda_instance, bool prefer_shmem) { -+ CudaInternal const* cuda_instance) { - // Just use global memory; coordinating through events to share constant - // memory with the non-graph interface is not really reasonable since - // events don't work with Graphs directly, and this would anyway require -@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker< - DriverType, LaunchBounds, - Experimental::CudaLaunchMechanism::GlobalMemory>; - global_launch_impl_t::create_parallel_launch_graph_node( -- driver, grid, block, shmem, cuda_instance, prefer_shmem); -+ driver, grid, block, shmem, cuda_instance); - } - #endif - }; -@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl< - - inline static void launch_kernel(const DriverType& driver, const dim3& grid, - const dim3& block, int shmem, -- const CudaInternal* cuda_instance, -- bool prefer_shmem) { -+ const CudaInternal* cuda_instance) { - if (!Impl::is_empty_launch(grid, block)) { - // Prevent multiple threads to simultaneously set the cache configuration - // preference and launch the same kernel -@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl< - - Impl::check_shmem_request(cuda_instance, shmem); - -- // If a desired occupancy is specified, we compute how much shared memory -- // to ask for to achieve that occupancy, assuming that the cache -- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic -- // shared memory computed is actually smaller than `shmem` we overwrite -- // `shmem` and set `prefer_shmem` to `false`. -- modify_launch_configuration_if_desired_occupancy_is_specified( -- driver.get_policy(), cuda_instance->m_deviceProp, -- get_cuda_func_attributes(), block, shmem, prefer_shmem); -- -- Impl::configure_shmem_preference< -- DriverType, Kokkos::LaunchBounds>( -- base_t::get_kernel_func(), prefer_shmem); -+ if (DriverType::Policy:: -+ experimental_contains_desired_occupancy) { -+ int desired_occupancy = -+ driver.get_policy().impl_get_desired_occupancy().value(); -+ size_t block_size = block.x * block.y * block.z; -+ Impl::configure_shmem_preference< -+ DriverType, -+ Kokkos::LaunchBounds>( -+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -+ shmem, desired_occupancy); -+ } - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - -@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl< - } - - static cudaFuncAttributes get_cuda_func_attributes() { -- // Race condition inside of cudaFuncGetAttributes if the same address is -- // given requires using a local variable as input instead of a static Rely -- // on static variable initialization to make sure only one thread executes -- // the code and the result is visible. -- auto wrap_get_attributes = []() -> cudaFuncAttributes { -- cudaFuncAttributes attr_tmp; -- KOKKOS_IMPL_CUDA_SAFE_CALL( -- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func())); -- return attr_tmp; -- }; -- static cudaFuncAttributes attr = wrap_get_attributes(); -- return attr; -+ return get_cuda_kernel_func_attributes< -+ DriverType, Kokkos::LaunchBounds>( -+ base_t::get_kernel_func()); - } - }; - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -index e586bb4cc6..0e348c092a 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -@@ -121,8 +121,7 @@ class ParallelFor, Kokkos::Cuda> { - maxblocks[1]), - 1); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); -@@ -139,8 +138,7 @@ class ParallelFor, Kokkos::Cuda> { - (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z -@@ -158,8 +156,7 @@ class ParallelFor, Kokkos::Cuda> { - (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to - // threadIdx.z -@@ -175,8 +172,7 @@ class ParallelFor, Kokkos::Cuda> { - (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else if (RP::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to - // threadIdx.z -@@ -191,8 +187,7 @@ class ParallelFor, Kokkos::Cuda> { - std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], - maxblocks[2])); - CudaParallelLaunch( -- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); - } -@@ -405,8 +400,8 @@ class ParallelReduce, ReducerType, - - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - if (m_result_ptr) { -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp -index ac160f8fe2..d1031751c2 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp -@@ -135,8 +135,7 @@ class ParallelFor, Kokkos::Cuda> { - #endif - - CudaParallelLaunch( -- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), -- false); -+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance()); - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) -@@ -375,8 +374,8 @@ class ParallelReduce, ReducerType, - - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - if (m_result_ptr) { -@@ -726,16 +725,16 @@ class ParallelScan, Kokkos::Cuda> { - m_final = false; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION - } - #endif - m_final = true; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - } - } - -@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal, - m_final = false; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION - } - #endif - m_final = true; - CudaParallelLaunch( - *this, grid, block, shmem, -- m_policy.space().impl_internal_space_instance(), -- false); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - const int size = Analysis::value_size(m_functor); - #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp -index cdd16085b3..ea9430b812 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp -@@ -552,8 +552,8 @@ class ParallelFor, - - CudaParallelLaunch( - *this, grid, block, shmem_size_total, -- m_policy.space().impl_internal_space_instance(), -- true); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) -@@ -878,8 +878,8 @@ class ParallelReduce, - - CudaParallelLaunch( - *this, grid, block, shmem_size_total, -- m_policy.space().impl_internal_space_instance(), -- true); // copy to device and execute -+ m_policy.space() -+ .impl_internal_space_instance()); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - m_policy.space().fence( -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp -index 34d4bef9fd..178012431c 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp -@@ -428,11 +428,6 @@ struct CudaReductionsFunctor { - // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) - // function qualifier which could be used to improve performance. - //---------------------------------------------------------------------------- --// Maximize shared memory and minimize L1 cache: --// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared ); --// For 2.0 capability: 48 KB shared and 16 KB L1 --//---------------------------------------------------------------------------- --//---------------------------------------------------------------------------- - /* - * Algorithmic constraints: - * (a) blockDim.y <= 1024 -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp -index fb3a6b138f..a12378a891 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp -@@ -100,8 +100,7 @@ class ParallelFor, - const int shared = 0; - - Kokkos::Impl::CudaParallelLaunch( -- *this, grid, block, shared, Cuda().impl_internal_space_instance(), -- false); -+ *this, grid, block, shared, Cuda().impl_internal_space_instance()); - } - - inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/lib/kokkos/kokkos_5731.diff b/lib/kokkos/kokkos_5731.diff deleted file mode 100644 index e95f4a1546..0000000000 --- a/lib/kokkos/kokkos_5731.diff +++ /dev/null @@ -1,46 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index 30b6958a67..b94f053272 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, - LaunchBounds{}); - } - --// NOTE these number can be obtained several ways: --// * One option is to download the CUDA Occupancy Calculator spreadsheet, select --// "Compute Capability" first and check what is the smallest "Shared Memory --// Size Config" that is available. The "Shared Memory Per Multiprocessor" in --// bytes is then to be found below in the summary. --// * Another option would be to look for the information in the "Tuning --// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in --// the "Shared Memory" section (more tedious) --inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { -- int const compute_capability = properties.major * 10 + properties.minor; -- return [compute_capability]() { -- switch (compute_capability) { -- case 30: -- case 32: -- case 35: return 16; -- case 37: return 80; -- case 50: -- case 53: -- case 60: -- case 62: return 64; -- case 52: -- case 61: return 96; -- case 70: -- case 80: -- case 86: -- case 90: return 8; -- case 75: return 32; -- default: -- Kokkos::Impl::throw_runtime_exception( -- "Unknown device in cuda block size deduction"); -- } -- return 0; -- }() * 1024; --} -- - } // namespace Impl - } // namespace Kokkos - diff --git a/lib/kokkos/kokkos_5739.diff b/lib/kokkos/kokkos_5739.diff deleted file mode 100644 index fe7a1ff551..0000000000 --- a/lib/kokkos/kokkos_5739.diff +++ /dev/null @@ -1,204 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -index b94f053272..252c13c524 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp -@@ -53,17 +53,69 @@ - namespace Kokkos { - namespace Impl { - -+inline int cuda_warp_per_sm_allocation_granularity( -+ cudaDeviceProp const& properties) { -+ // Allocation granularity of warps in each sm -+ switch (properties.major) { -+ case 3: -+ case 5: -+ case 7: -+ case 8: -+ case 9: return 4; -+ case 6: return (properties.minor == 0 ? 2 : 4); -+ default: -+ throw_runtime_exception( -+ "Unknown device in cuda warp per sm allocation granularity"); -+ return 0; -+ } -+} -+ -+inline int cuda_max_warps_per_sm_registers( -+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) { -+ // Maximum number of warps per sm as a function of register counts, -+ // subject to the constraint that warps are allocated with a fixed granularity -+ int const max_regs_per_block = properties.regsPerBlock; -+ int const regs_per_warp = attributes.numRegs * properties.warpSize; -+ int const warp_granularity = -+ cuda_warp_per_sm_allocation_granularity(properties); -+ // The granularity of register allocation is chunks of 256 registers per warp, -+ // which implies a need to over-allocate, so we round up -+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256; -+ -+ // The maximum number of warps per SM is constrained from above by register -+ // allocation. To satisfy the constraint that warps per SM is allocated at a -+ // finite granularity, we need to round down. -+ int const max_warps_per_sm = -+ warp_granularity * -+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity)); -+ -+ return max_warps_per_sm; -+} -+ - inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, - cudaFuncAttributes const& attributes, - int block_size, size_t dynamic_shmem) { -- // Limits due do registers/SM -+ // Limits due to registers/SM - int const regs_per_sm = properties.regsPerMultiprocessor; - int const regs_per_thread = attributes.numRegs; - // The granularity of register allocation is chunks of 256 registers per warp - // -> 8 registers per thread - int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); -- int const max_blocks_regs = -- regs_per_sm / (allocated_regs_per_thread * block_size); -+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size); -+ -+ // Compute the maximum number of warps as a function of the number of -+ // registers -+ int const max_warps_per_sm_registers = -+ cuda_max_warps_per_sm_registers(properties, attributes); -+ -+ // Constrain the number of blocks to respect the maximum number of warps per -+ // SM On face value this should be an equality, but due to the warp -+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the -+ // left-hand-side of this comparison can overshoot what the hardware allows -+ // based on register counts alone -+ while ((max_blocks_regs * block_size / properties.warpSize) > -+ max_warps_per_sm_registers) -+ max_blocks_regs--; - - // Limits due to shared memory/SM - size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor; -@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, - LaunchBounds{}); - } - -+template -+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr, -+ LaunchBounds) { -+ auto const& prop = Kokkos::Cuda().cuda_device_prop(); -+ -+ // Thin version of cuda_get_opt_block_size for cases where there is no shared -+ // memory -+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; }; -+ -+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem, -+ LaunchBounds{}); -+} -+ - } // namespace Impl - } // namespace Kokkos - -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -index 5c4c3a7d39..170183ca0a 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, - // The granularity of register allocation is chunks of 256 registers per warp - // -> 8 registers per thread - const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); -- const size_t max_blocks_regs = -+ size_t max_blocks_regs = - regs_per_sm / (allocated_regs_per_thread * block_size); - -+ // Compute the maximum number of warps as a function of the number of -+ // registers -+ const size_t max_warps_per_sm_registers = -+ cuda_max_warps_per_sm_registers(device_props, func_attr); -+ -+ // Constrain the number of blocks to respect the maximum number of warps per -+ // SM On face value this should be an equality, but due to the warp -+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the -+ // left-hand-side of this comparison can overshoot what the hardware allows -+ // based on register counts alone -+ while ((max_blocks_regs * block_size / device_props.warpSize) > -+ max_warps_per_sm_registers) -+ max_blocks_regs--; -+ - // Compute how many threads per sm we actually want - const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; - // only allocate multiples of warp size -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -index 0e348c092a..7e4f62f12e 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp -@@ -67,6 +67,34 @@ - namespace Kokkos { - namespace Impl { - -+template -+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { -+ cudaFuncAttributes attr = -+ CudaParallelLaunch::get_cuda_func_attributes(); -+ auto const& prop = pol.space().cuda_device_prop(); -+ -+ // Limits due to registers/SM, MDRange doesn't have -+ // shared memory constraints -+ int const optimal_block_size = -+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{}); -+ -+ // Compute how many blocks of this size we can launch, based on warp -+ // constraints -+ int const max_warps_per_sm_registers = -+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr); -+ int const max_num_threads_from_warps = -+ max_warps_per_sm_registers * prop.warpSize; -+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size; -+ -+ // Compute the total number of threads -+ int const max_threads_per_sm = optimal_block_size * max_num_blocks; -+ -+ return std::min( -+ max_threads_per_sm, -+ static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); -+} -+ - template - class ParallelFor, Kokkos::Cuda> { - public: -@@ -85,18 +113,7 @@ class ParallelFor, Kokkos::Cuda> { - public: - template - static int max_tile_size_product(const Policy& pol, const Functor&) { -- cudaFuncAttributes attr = -- CudaParallelLaunch::get_cuda_func_attributes(); -- auto const& prop = pol.space().cuda_device_prop(); -- // Limits due to registers/SM, MDRange doesn't have -- // shared memory constraints -- int const regs_per_sm = prop.regsPerMultiprocessor; -- int const regs_per_thread = attr.numRegs; -- int const max_threads_per_sm = regs_per_sm / regs_per_thread; -- return std::min( -- max_threads_per_sm, -- static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); -+ return max_tile_size_product_helper(pol, LaunchBounds{}); - } - Policy const& get_policy() const { return m_rp; } - inline __device__ void operator()() const { -@@ -258,17 +275,7 @@ class ParallelReduce, ReducerType, - public: - template - static int max_tile_size_product(const Policy& pol, const Functor&) { -- cudaFuncAttributes attr = -- CudaParallelLaunch::get_cuda_func_attributes(); -- auto const& prop = pol.space().cuda_device_prop(); -- // Limits due do registers/SM -- int const regs_per_sm = prop.regsPerMultiprocessor; -- int const regs_per_thread = attr.numRegs; -- int const max_threads_per_sm = regs_per_sm / regs_per_thread; -- return std::min( -- max_threads_per_sm, -- static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); -+ return max_tile_size_product_helper(pol, LaunchBounds{}); - } - Policy const& get_policy() const { return m_policy; } - inline __device__ void exec_range(reference_type update) const { diff --git a/lib/kokkos/kokkos_fix_5706_apply_last.diff b/lib/kokkos/kokkos_fix_5706_apply_last.diff deleted file mode 100644 index 5d298323fd..0000000000 --- a/lib/kokkos/kokkos_fix_5706_apply_last.diff +++ /dev/null @@ -1,63 +0,0 @@ -diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -index 170183ca0a..ba43e362bb 100644 ---- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp -@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker< - Impl::check_shmem_request(cuda_instance, shmem); - if (DriverType::Policy:: - experimental_contains_desired_occupancy) { -+ /* - int desired_occupancy = - driver.get_policy().impl_get_desired_occupancy().value(); - size_t block_size = block.x * block.y * block.z; - Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -- shmem, desired_occupancy); -+ shmem, desired_occupancy);*/ -+ Kokkos::Impl::throw_runtime_exception( -+ std::string("Cuda graph node creation FAILED:" -+ " occupancy requests are currently broken.")); - } - - void const* args[] = {&driver}; -@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker< - - if (!Impl::is_empty_launch(grid, block)) { - Impl::check_shmem_request(cuda_instance, shmem); -- if constexpr (DriverType::Policy:: -+ if (DriverType::Policy:: - experimental_contains_desired_occupancy) { -- int desired_occupancy = -+ /*int desired_occupancy = - driver.get_policy().impl_get_desired_occupancy().value(); - size_t block_size = block.x * block.y * block.z; - Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -- shmem, desired_occupancy); -+ shmem, desired_occupancy);*/ -+ Kokkos::Impl::throw_runtime_exception( -+ std::string("Cuda graph node creation FAILED:" -+ " occupancy requests are currently broken.")); - } - - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); -@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl< - - if (DriverType::Policy:: - experimental_contains_desired_occupancy) { -- int desired_occupancy = -+ /*int desired_occupancy = - driver.get_policy().impl_get_desired_occupancy().value(); - size_t block_size = block.x * block.y * block.z; - Impl::configure_shmem_preference< - DriverType, - Kokkos::LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, -- shmem, desired_occupancy); -+ shmem, desired_occupancy);*/ -+ Kokkos::Impl::throw_runtime_exception( -+ std::string("Cuda graph node creation FAILED:" -+ " occupancy requests are currently broken.")); - } - - KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();