Whack extra files from #3532
@@ -1,199 +0,0 @@
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 22af411f32..530510a0d1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))

#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+ endif

ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 88ddc48378..b83ced9243 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -102,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index f56cef1651..2585a6a64c 100644
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index ef16aad047..c1d76cceeb 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")

IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)

SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()

+IF (KOKKOS_ARCH_HOPPER90)
+ SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)
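
Taken together, the Makefile and CMake hunks above expose NVIDIA Hopper (compute capability 9.0, sm_90) through both build systems. A minimal usage sketch follows; only KOKKOS_ARCH=Hopper90 and -DKokkos_ARCH_HOPPER90=ON come from this diff, the remaining flags are ordinary Kokkos build options and may vary by setup:

    # CMake build: the new arch option added in kokkos_arch.cmake
    cmake -B build -DKokkos_ENABLE_CUDA=ON -DKokkos_ARCH_HOPPER90=ON
    # GNU make build: the new KOKKOS_ARCH value parsed by Makefile.kokkos
    make KOKKOS_DEVICES=Cuda KOKKOS_ARCH=Hopper90
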
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 56f9117844..fcd4773dbc 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
case 61: return 96;
case 70:
case 80:
- case 86: return 8;
+ case 86:
+ case 90: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index 40a263561f..8c40ebd60d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010

#if CUDA_VERSION >= 11010 && \
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION
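
Note the guard now keys on the generation-wide macros (KOKKOS_ARCH_AMPERE, KOKKOS_ARCH_HOPPER) rather than per-compute-capability ones, so future parts within a generation need no guard edits. A minimal sketch of the pattern:

    // Family macros (defined by the build system for any CC in the generation)
    // keep guards stable as new compute capabilities are added.
    #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
      // hardware bhalf_t support available; identity casts suffice here
    #endif
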
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index f9451ecfe6..2ce1efb98c 100644
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {

struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_AMPERE)
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index a9bc085912..27ee1d4232 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {

// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 840db4327c..7e5addbc5b 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
- !defined(KOKKOS_ARCH_AMPERE)
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 5ac7d8af30..ba101f699e 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@@ -1,523 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index fcd4773dbc..30b6958a67 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}

-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these number can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
return 0;
}() * 1024;
}
+
} // namespace Impl
} // namespace Kokkos

diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 5811498e01..e22eb3b842 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif

-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
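
The deleted hunk above drops the global cudaDeviceSetCacheConfig(cudaFuncCachePreferShared) call from initialization; later in this commit the shared-memory/L1 split is instead requested per kernel through the carveout attribute. A standalone sketch of that CUDA runtime call (kernel name illustrative):

    #include <cuda_runtime.h>

    __global__ void my_kernel() {}

    void set_carveout() {
      // Hint the driver to dedicate roughly half of the unified
      // L1/shared-memory capacity of each SM to shared memory.
      cudaFuncSetAttribute(my_kernel,
                           cudaFuncAttributePreferredSharedMemoryCarveout, 50);
    }
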

diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84f..5c4c3a7d39 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------

template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}

-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
+
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
- bool prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+ const KernelFuncPtr& func) {
+ // Only call cudaFuncGetAttributes once for each unique kernel
+ // by leveraging static variable initialization rules
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+ cudaFuncAttributes attr;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+ return attr;
+ };
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
+ return func_attr;
+}
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+ const cudaDeviceProp& device_props,
+ const size_t block_size, int& shmem,
+ const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
- // On Kepler the L1 has no benefit since it doesn't cache reads
+
+ const auto& func_attr =
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+ // Compute limits for number of blocks due to registers/SM
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
+ const size_t regs_per_thread = func_attr.numRegs;
+ // The granularity of register allocation is chunks of 256 registers per warp
+ // -> 8 registers per thread
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+ const size_t max_blocks_regs =
+ regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute how many threads per sm we actually want
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+ // only allocate multiples of warp size
+ const size_t num_threads_desired =
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+ // Get close to the desired occupancy,
+ // don't undershoot by much but also don't allocate a whole new block just
+ // because one is a few threads over otherwise.
+ size_t num_blocks_desired =
+ (num_threads_desired + block_size * 0.8) / block_size;
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+ // Calculate how much shared memory we need per block
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
+ // If we want to lower occupancy we have to make sure we request at least that
+ // much in aggregate over all blocks, so that shared memory actually becomes a
+ // limiting factor for occupancy
+ constexpr size_t min_shmem_size_per_sm = 8192;
+ if ((occupancy < 100) &&
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+ // Need to set the caller's shmem variable so that the
+ // kernel launch uses the correct dynamic shared memory request
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
+ }
+
+ // Compute the carveout fraction we need based on occupancy
+ // Use multiples of 8kB
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+ size_t carveout = shmem_per_block == 0
+ ? 0
+ : 100 *
+ (((num_blocks_desired * shmem_per_block +
+ min_shmem_size_per_sm - 1) /
+ min_shmem_size_per_sm) *
+ min_shmem_size_per_sm) /
+ max_shmem_per_sm;
+ if (carveout > 100) carveout = 100;
+
+ // Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
- func,
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
- return prefer_shmem;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+ return carveout;
};
- static bool cache_config_preference_cached = set_cache_config();
- if (cache_config_preference_cached != prefer_shmem) {
+ // Store the value in a static variable so we only reset if needed
+ static size_t cache_config_preference_cached = set_cache_config();
+ if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
- (void)prefer_shmem;
+ (void)device_props;
+ (void)block_size;
+ (void)occupancy;
#endif
}

-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const& policy, cudaDeviceProp const& properties,
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
- bool& prefer_shmem) {
- int const block_size = block.x * block.y * block.z;
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
- size_t const static_shmem = attributes.sharedSizeBytes;
-
- // round to nearest integer and avoid division by zero
- int active_blocks = std::max(
- 1, static_cast<int>(std::round(
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
- block_size * desired_occupancy / 100)));
- int const dynamic_shmem =
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
- if (dynamic_shmem > shmem) {
- shmem = dynamic_shmem;
- prefer_shmem = false;
- }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
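
To make the carveout arithmetic in configure_shmem_preference above concrete, here is a worked standalone sketch; the device numbers are assumptions (roughly CC 8.0-class), not queried values, and the register-count cap applied in the real code is omitted:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main() {
      // Assumed device characteristics, not queried.
      const std::size_t max_threads_per_sm = 2048;
      const std::size_t max_shmem_per_sm = 164 * 1024;
      const std::size_t block_size = 256, occupancy = 50;
      std::size_t shmem_per_block = 2048;  // dynamic + static shmem, assumed

      // Desired threads, rounded up to whole warps: 1024.
      const std::size_t num_threads_desired =
          ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
      // Blocks needed, tolerating a 20% overshoot before adding one: 4.
      std::size_t num_blocks_desired =
          (num_threads_desired + block_size * 0.8) / block_size;

      // Aggregate shmem rounded up to 8 kB granules, as a percent of the SM.
      constexpr std::size_t min_shmem_size_per_sm = 8192;
      std::size_t carveout =
          100 *
          (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) /
            min_shmem_size_per_sm) *
           min_shmem_size_per_sm) /
          max_shmem_per_sm;
      carveout = std::min<std::size_t>(carveout, 100);
      std::printf("carveout = %zu%%\n", carveout);  // prints: carveout = 4%
    }
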

@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<

if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }

void const* args[] = {&driver};

@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<

if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if constexpr (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }

auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);

@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
+ driver, grid, block, shmem, cuda_instance);
}
#endif
};
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<

inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
- const CudaInternal* cuda_instance,
- bool prefer_shmem) {
+ const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<

Impl::check_shmem_request(cuda_instance, shmem);

- // If a desired occupancy is specified, we compute how much shared memory
- // to ask for to achieve that occupancy, assuming that the cache
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
- // shared memory computed is actually smaller than `shmem` we overwrite
- // `shmem` and set `prefer_shmem` to `false`.
- modify_launch_configuration_if_desired_occupancy_is_specified(
- driver.get_policy(), cuda_instance->m_deviceProp,
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
- Impl::configure_shmem_preference<
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<
+ DriverType,
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
}

static cudaFuncAttributes get_cuda_func_attributes() {
- // Race condition inside of cudaFuncGetAttributes if the same address is
- // given requires using a local variable as input instead of a static Rely
- // on static variable initialization to make sure only one thread executes
- // the code and the result is visible.
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
- cudaFuncAttributes attr_tmp;
- KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
- return attr_tmp;
- };
- static cudaFuncAttributes attr = wrap_get_attributes();
- return attr;
+ return get_cuda_kernel_func_attributes<
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func());
}
};
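
The get_cuda_kernel_func_attributes helper added above leans on C++ static-local initialization, which the language guarantees runs exactly once even under concurrent callers, so the driver query happens one time per kernel type. A self-contained sketch of the idiom (names and the error macro are illustrative, not the Kokkos API):

    #include <cstdlib>
    #include <cuda_runtime.h>

    // Illustrative stand-in for KOKKOS_IMPL_CUDA_SAFE_CALL.
    #define CUDA_SAFE(call) \
      do { if ((call) != cudaSuccess) std::abort(); } while (0)

    template <class Kernel>
    const cudaFuncAttributes& cached_attributes(Kernel func) {
      // The initializer runs once per Kernel instantiation; later calls
      // (from any thread) just read the cached result.
      static cudaFuncAttributes attr = [&] {
        cudaFuncAttributes a;
        CUDA_SAFE(cudaFuncGetAttributes(&a, func));
        return a;
      }();
      return attr;
    }
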

diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index e586bb4cc6..0e348c092a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,

CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute

if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index ac160f8fe2..d1031751c2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif

CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}

ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,

CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute

if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
}

@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute

const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdd16085b3..ea9430b812 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,

CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}

ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,

CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute

if (!m_result_ptr_device_accessible) {
m_policy.space().fence(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 34d4bef9fd..178012431c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fb3a6b138f..a12378a891 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;

Kokkos::Impl::CudaParallelLaunch<Self>(
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
- false);
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
}

inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -1,46 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 30b6958a67..b94f053272 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}

-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-// "Compute Capability" first and check what is the smallest "Shared Memory
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-// bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-// the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
- int const compute_capability = properties.major * 10 + properties.minor;
- return [compute_capability]() {
- switch (compute_capability) {
- case 30:
- case 32:
- case 35: return 16;
- case 37: return 80;
- case 50:
- case 53:
- case 60:
- case 62: return 64;
- case 52:
- case 61: return 96;
- case 70:
- case 80:
- case 86:
- case 90: return 8;
- case 75: return 32;
- default:
- Kokkos::Impl::throw_runtime_exception(
- "Unknown device in cuda block size deduction");
- }
- return 0;
- }() * 1024;
-}
-
} // namespace Impl
} // namespace Kokkos

@@ -1,204 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index b94f053272..252c13c524 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -53,17 +53,69 @@
namespace Kokkos {
namespace Impl {

+inline int cuda_warp_per_sm_allocation_granularity(
+ cudaDeviceProp const& properties) {
+ // Allocation granularity of warps in each sm
+ switch (properties.major) {
+ case 3:
+ case 5:
+ case 7:
+ case 8:
+ case 9: return 4;
+ case 6: return (properties.minor == 0 ? 2 : 4);
+ default:
+ throw_runtime_exception(
+ "Unknown device in cuda warp per sm allocation granularity");
+ return 0;
+ }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+ // Maximum number of warps per sm as a function of register counts,
+ // subject to the constraint that warps are allocated with a fixed granularity
+ int const max_regs_per_block = properties.regsPerBlock;
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
+ int const warp_granularity =
+ cuda_warp_per_sm_allocation_granularity(properties);
+ // The granularity of register allocation is chunks of 256 registers per warp,
+ // which implies a need to over-allocate, so we round up
+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
+
+ // The maximum number of warps per SM is constrained from above by register
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
+ // finite granularity, we need to round down.
+ int const max_warps_per_sm =
+ warp_granularity *
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+ return max_warps_per_sm;
+}
+
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
- regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ int const max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(properties, attributes);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / properties.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;

// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}

+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+ LaunchBounds) {
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
+ // memory
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+ LaunchBounds{});
+}
+
} // namespace Impl
} // namespace Kokkos
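
The register math above uses the same two rounding moves in several places: demand is rounded up to an allocation granule, capacity is rounded down to one. A tiny worked sketch with assumed numbers:

    #include <cstdio>

    int main() {
      // Round register demand UP to the 256-regs-per-warp granule,
      // i.e. 8 registers per thread at warp size 32 (27 -> 32).
      int regs_per_thread = 27;
      int allocated = 8 * ((regs_per_thread + 8 - 1) / 8);
      // Round warps per SM DOWN to the allocation granularity (43 -> 40 at 4).
      int granularity = 4, raw_warps = 43;
      int warps = granularity * (raw_warps / granularity);
      std::printf("%d %d\n", allocated, warps);  // prints: 32 40
    }
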

diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5c4c3a7d39..170183ca0a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);

+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ const size_t max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
+
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0e348c092a..7e4f62f12e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {

+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+ cudaFuncAttributes attr =
+ CudaParallelLaunch<ParallelType,
+ LaunchBounds>::get_cuda_func_attributes();
+ auto const& prop = pol.space().cuda_device_prop();
+
+ // Limits due to registers/SM, MDRange doesn't have
+ // shared memory constraints
+ int const optimal_block_size =
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+ // Compute how many blocks of this size we can launch, based on warp
+ // constraints
+ int const max_warps_per_sm_registers =
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+ int const max_num_threads_from_warps =
+ max_warps_per_sm_registers * prop.warpSize;
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+ // Compute the total number of threads
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+ return std::min(
+ max_threads_per_sm,
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelFor,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelReduce,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {
@@ -1,63 +0,0 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 170183ca0a..ba43e362bb 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
+ /*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}

void const* args[] = {&driver};
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<

if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- if constexpr (DriverType::Policy::
+ if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}

auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<

if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}

KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
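
The occupancy branches above test a compile-time policy trait; the earlier hunks used if constexpr for this, which instantiates the guarded body only for policies that actually carry a desired occupancy. A standalone sketch of that pattern (types illustrative, not the Kokkos ones):

    #include <cstdio>

    struct WithOccupancy {
      static constexpr bool experimental_contains_desired_occupancy = true;
    };
    struct Plain {
      static constexpr bool experimental_contains_desired_occupancy = false;
    };

    template <class Policy>
    void launch() {
      if constexpr (Policy::experimental_contains_desired_occupancy) {
        // Compiled only for policies with an occupancy request, so members
        // like impl_get_desired_occupancy() need not exist on Plain.
        std::printf("occupancy-aware path\n");
      } else {
        std::printf("default path\n");
      }
    }

    int main() {
      launch<WithOccupancy>();  // occupancy-aware path
      launch<Plain>();          // default path
    }
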