Whack extra files from #3532

Stan Moore
2023-01-26 15:12:34 -07:00
parent 866cf8ae60
commit b7dfa3db05
5 changed files with 0 additions and 1035 deletions

@@ -1,199 +0,0 @@
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 22af411f32..530510a0d1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+ endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 88ddc48378..b83ced9243 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -102,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index f56cef1651..2585a6a64c 100644
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index ef16aad047..c1d76cceeb 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()
+IF (KOKKOS_ARCH_HOPPER90)
+ SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)
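The build-system changes above only introduce the KOKKOS_ARCH_HOPPER90 (and KOKKOS_ARCH_HOPPER) defines. A minimal sketch, not part of the patch, of application code keying off the new macro once Kokkos is configured with KOKKOS_ARCH=Hopper90 or -DKokkos_ARCH_HOPPER90=ON:

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
#if defined(KOKKOS_ARCH_HOPPER90)
  // KokkosCore_config.h defined the macro, so this build targets sm_90.
  std::printf("Kokkos was built for NVIDIA Hopper (compute capability 9.0)\n");
#else
  std::printf("Kokkos was built without Hopper (sm_90) support\n");
#endif
  Kokkos::finalize();
  return 0;
}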
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 56f9117844..fcd4773dbc 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
case 61: return 96;
case 70:
case 80:
- case 86: return 8;
+ case 86:
+ case 90: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index 40a263561f..8c40ebd60d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index f9451ecfe6..2ce1efb98c 100644
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_AMPERE)
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index a9bc085912..27ee1d4232 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 840db4327c..7e5addbc5b 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
- !defined(KOKKOS_ARCH_AMPERE)
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 5ac7d8af30..ba101f699e 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

@@ -1,523 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index fcd4773dbc..30b6958a67 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these number can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
return 0;
}() * 1024;
}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 5811498e01..e22eb3b842 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84f..5c4c3a7d39 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}
-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
+
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
- bool prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+ const KernelFuncPtr& func) {
+ // Only call cudaFuncGetAttributes once for each unique kernel
+ // by leveraging static variable initialization rules
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+ cudaFuncAttributes attr;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+ return attr;
+ };
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
+ return func_attr;
+}
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+ const cudaDeviceProp& device_props,
+ const size_t block_size, int& shmem,
+ const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
- // On Kepler the L1 has no benefit since it doesn't cache reads
+
+ const auto& func_attr =
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+ // Compute limits for number of blocks due to registers/SM
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
+ const size_t regs_per_thread = func_attr.numRegs;
+ // The granularity of register allocation is chunks of 256 registers per warp
+ // -> 8 registers per thread
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+ const size_t max_blocks_regs =
+ regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute how many threads per sm we actually want
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+ // only allocate multiples of warp size
+ const size_t num_threads_desired =
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+ // Get close to the desired occupancy,
+ // don't undershoot by much but also don't allocate a whole new block just
+ // because one is a few threads over otherwise.
+ size_t num_blocks_desired =
+ (num_threads_desired + block_size * 0.8) / block_size;
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+ // Calculate how much shared memory we need per block
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
+ // If we want to lower occupancy we have to make sure we request at least that
+ // much in aggregate over all blocks, so that shared memory actually becomes a
+ // limiting factor for occupancy
+ constexpr size_t min_shmem_size_per_sm = 8192;
+ if ((occupancy < 100) &&
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+ // Need to set the caller's shmem variable so that the
+ // kernel launch uses the correct dynamic shared memory request
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
+ }
+
+ // Compute the carveout fraction we need based on occupancy
+ // Use multiples of 8kB
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+ size_t carveout = shmem_per_block == 0
+ ? 0
+ : 100 *
+ (((num_blocks_desired * shmem_per_block +
+ min_shmem_size_per_sm - 1) /
+ min_shmem_size_per_sm) *
+ min_shmem_size_per_sm) /
+ max_shmem_per_sm;
+ if (carveout > 100) carveout = 100;
+
+ // Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
- func,
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
- return prefer_shmem;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+ return carveout;
};
- static bool cache_config_preference_cached = set_cache_config();
- if (cache_config_preference_cached != prefer_shmem) {
+ // Store the value in a static variable so we only reset if needed
+ static size_t cache_config_preference_cached = set_cache_config();
+ if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
- (void)prefer_shmem;
+ (void)device_props;
+ (void)block_size;
+ (void)occupancy;
#endif
}
-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const& policy, cudaDeviceProp const& properties,
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
- bool& prefer_shmem) {
- int const block_size = block.x * block.y * block.z;
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
- size_t const static_shmem = attributes.sharedSizeBytes;
-
- // round to nearest integer and avoid division by zero
- int active_blocks = std::max(
- 1, static_cast<int>(std::round(
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
- block_size * desired_occupancy / 100)));
- int const dynamic_shmem =
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
- if (dynamic_shmem > shmem) {
- shmem = dynamic_shmem;
- prefer_shmem = false;
- }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
void const* args[] = {&driver};
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if constexpr (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
+ driver, grid, block, shmem, cuda_instance);
}
#endif
};
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
- const CudaInternal* cuda_instance,
- bool prefer_shmem) {
+ const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem);
- // If a desired occupancy is specified, we compute how much shared memory
- // to ask for to achieve that occupancy, assuming that the cache
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
- // shared memory computed is actually smaller than `shmem` we overwrite
- // `shmem` and set `prefer_shmem` to `false`.
- modify_launch_configuration_if_desired_occupancy_is_specified(
- driver.get_policy(), cuda_instance->m_deviceProp,
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
- Impl::configure_shmem_preference<
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<
+ DriverType,
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
}
static cudaFuncAttributes get_cuda_func_attributes() {
- // Race condition inside of cudaFuncGetAttributes if the same address is
- // given requires using a local variable as input instead of a static Rely
- // on static variable initialization to make sure only one thread executes
- // the code and the result is visible.
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
- cudaFuncAttributes attr_tmp;
- KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
- return attr_tmp;
- };
- static cudaFuncAttributes attr = wrap_get_attributes();
- return attr;
+ return get_cuda_kernel_func_attributes<
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func());
}
};
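For orientation, a plain-number walk-through of the carveout arithmetic in the configure_shmem_preference added above; this is a standalone sketch, and the device and kernel figures are assumptions rather than values queried from a real GPU:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed device properties (roughly an A100-class part).
  const std::size_t regs_per_sm        = 65536;
  const std::size_t max_threads_per_sm = 2048;
  const std::size_t max_shmem_per_sm   = 164 * 1024;
  // Assumed kernel properties and launch configuration.
  const std::size_t regs_per_thread = 42;
  const std::size_t block_size      = 256;
  std::size_t shmem_per_block       = 4096;  // dynamic + static shared memory
  const std::size_t occupancy       = 50;    // requested occupancy in percent

  // Registers are allocated in chunks of 256 per warp -> 8 per thread.
  const std::size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 7) / 8);
  const std::size_t max_blocks_regs =
      regs_per_sm / (allocated_regs_per_thread * block_size);

  // Threads (rounded up to whole warps) and blocks needed for the requested
  // occupancy, without undershooting by much.
  const std::size_t num_threads_desired =
      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
  std::size_t num_blocks_desired =
      (num_threads_desired + block_size * 8 / 10) / block_size;
  num_blocks_desired = std::min(max_blocks_regs, num_blocks_desired);
  if (num_blocks_desired == 0) num_blocks_desired = 1;

  // Shared memory only limits occupancy once at least 8 kB per SM is in use.
  constexpr std::size_t min_shmem_size_per_sm = 8192;
  if (occupancy < 100 &&
      shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)
    shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;

  // Carveout as a percentage of the SM's shared memory, in 8 kB steps.
  std::size_t carveout =
      shmem_per_block == 0
          ? 0
          : 100 *
                (((num_blocks_desired * shmem_per_block +
                   min_shmem_size_per_sm - 1) /
                  min_shmem_size_per_sm) *
                 min_shmem_size_per_sm) /
                max_shmem_per_sm;
  if (carveout > 100) carveout = 100;

  std::printf("blocks/SM = %zu, shmem/block = %zu B, carveout = %zu%%\n",
              num_blocks_desired, shmem_per_block, carveout);
  return 0;
}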
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index e586bb4cc6..0e348c092a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index ac160f8fe2..d1031751c2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
}
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdd16085b3..ea9430b812 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
m_policy.space().fence(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 34d4bef9fd..178012431c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fb3a6b138f..a12378a891 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>(
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
- false);
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
}
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

@@ -1,46 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 30b6958a67..b94f053272 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-// "Compute Capability" first and check what is the smallest "Shared Memory
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-// bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-// the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
- int const compute_capability = properties.major * 10 + properties.minor;
- return [compute_capability]() {
- switch (compute_capability) {
- case 30:
- case 32:
- case 35: return 16;
- case 37: return 80;
- case 50:
- case 53:
- case 60:
- case 62: return 64;
- case 52:
- case 61: return 96;
- case 70:
- case 80:
- case 86:
- case 90: return 8;
- case 75: return 32;
- default:
- Kokkos::Impl::throw_runtime_exception(
- "Unknown device in cuda block size deduction");
- }
- return 0;
- }() * 1024;
-}
-
} // namespace Impl
} // namespace Kokkos

@@ -1,204 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index b94f053272..252c13c524 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -53,17 +53,69 @@
namespace Kokkos {
namespace Impl {
+inline int cuda_warp_per_sm_allocation_granularity(
+ cudaDeviceProp const& properties) {
+ // Allocation granularity of warps in each sm
+ switch (properties.major) {
+ case 3:
+ case 5:
+ case 7:
+ case 8:
+ case 9: return 4;
+ case 6: return (properties.minor == 0 ? 2 : 4);
+ default:
+ throw_runtime_exception(
+ "Unknown device in cuda warp per sm allocation granularity");
+ return 0;
+ }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+ // Maximum number of warps per sm as a function of register counts,
+ // subject to the constraint that warps are allocated with a fixed granularity
+ int const max_regs_per_block = properties.regsPerBlock;
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
+ int const warp_granularity =
+ cuda_warp_per_sm_allocation_granularity(properties);
+ // The granularity of register allocation is chunks of 256 registers per warp,
+ // which implies a need to over-allocate, so we round up
+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
+
+ // The maximum number of warps per SM is constrained from above by register
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
+ // finite granularity, we need to round down.
+ int const max_warps_per_sm =
+ warp_granularity *
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+ return max_warps_per_sm;
+}
+
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
- regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ int const max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(properties, attributes);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / properties.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+ LaunchBounds) {
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
+ // memory
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+ LaunchBounds{});
+}
+
} // namespace Impl
} // namespace Kokkos
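A standalone sketch of the register-limited warp bound computed by cuda_max_warps_per_sm_registers above, on assumed device and kernel numbers; the per-warp allocation is rounded up to whole 256-register chunks, following the granularity described in the comments:

#include <cstdio>

int main() {
  // Assumed device properties (compute capability 8.x class).
  const int warp_size        = 32;
  const int regs_per_block   = 65536;  // properties.regsPerBlock
  const int warp_granularity = 4;      // per cuda_warp_per_sm_allocation_granularity
  // Assumed kernel property.
  const int regs_per_thread = 96;      // attributes.numRegs

  // Round the per-warp register demand up to whole 256-register chunks
  // (assumption: registers are handed to warps in 256-register units).
  const int regs_per_warp           = regs_per_thread * warp_size;            // 3072
  const int allocated_regs_per_warp = 256 * ((regs_per_warp + 255) / 256);    // 3072

  // Round the register-limited warp count down to the warp allocation
  // granularity, as the function above does.
  const int max_warps_per_sm =
      warp_granularity *
      (regs_per_block / (allocated_regs_per_warp * warp_granularity));

  std::printf("register-limited warps per SM: %d\n", max_warps_per_sm);  // 20
  return 0;
}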
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5c4c3a7d39..170183ca0a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ const size_t max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
+
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0e348c092a..7e4f62f12e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {
+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+ cudaFuncAttributes attr =
+ CudaParallelLaunch<ParallelType,
+ LaunchBounds>::get_cuda_func_attributes();
+ auto const& prop = pol.space().cuda_device_prop();
+
+ // Limits due to registers/SM, MDRange doesn't have
+ // shared memory constraints
+ int const optimal_block_size =
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+ // Compute how many blocks of this size we can launch, based on warp
+ // constraints
+ int const max_warps_per_sm_registers =
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+ int const max_num_threads_from_warps =
+ max_warps_per_sm_registers * prop.warpSize;
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+ // Compute the total number of threads
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+ return std::min(
+ max_threads_per_sm,
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelFor,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelReduce,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {
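The tile-size bound in max_tile_size_product_helper above reduces to a few integer operations; a standalone sketch on assumed numbers (the 1024 cap standing in for CudaTraits::MaxHierarchicalParallelism is also an assumption here):

#include <algorithm>
#include <cstdio>

int main() {
  // Assumed inputs to the helper.
  const int warp_size                    = 32;
  const int max_warps_per_sm_registers   = 40;    // from register limits
  const int optimal_block_size           = 256;   // from block-size deduction
  const int max_hierarchical_parallelism = 1024;  // CudaTraits value (assumed)

  // How many optimally sized blocks fit under the register-limited warp count.
  const int max_num_threads_from_warps = max_warps_per_sm_registers * warp_size;   // 1280
  const int max_num_blocks     = max_num_threads_from_warps / optimal_block_size;  // 5
  const int max_threads_per_sm = optimal_block_size * max_num_blocks;              // 1280

  // The MDRange tile-size product is capped by both quantities.
  std::printf("max tile size product: %d\n",
              std::min(max_threads_per_sm, max_hierarchical_parallelism));  // 1024
  return 0;
}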

@@ -1,63 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 170183ca0a..ba43e362bb 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
+ /*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
void const* args[] = {&driver};
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- if constexpr (DriverType::Policy::
+ if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
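The experimental_contains_desired_occupancy branches above only trigger when a policy carries an occupancy hint. A minimal sketch of such a request via Kokkos::Experimental::prefer and DesiredOccupancy (an API available in recent Kokkos releases; treat the exact spelling as an assumption):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Ask for roughly 50% occupancy instead of the default heuristic.
    auto policy = Kokkos::Experimental::prefer(
        Kokkos::RangePolicy<>(0, 1 << 20),
        Kokkos::Experimental::DesiredOccupancy(50));
    Kokkos::parallel_for(
        "occupancy_hint_example", policy,
        KOKKOS_LAMBDA(const int i) { (void)i; });
  }
  Kokkos::finalize();
  return 0;
}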