Merge pull request #3532 from stanmoore1/kk_occupancy
Update Kokkos version in LAMMPS to 3.7.1
@@ -49,8 +49,8 @@ if(DOWNLOAD_KOKKOS)
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
include(ExternalProject)
set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.00.tar.gz" CACHE STRING "URL for KOKKOS tarball")
set(KOKKOS_MD5 "84991eca9f066383abe119a5bc7a11c4" CACHE STRING "MD5 checksum of KOKKOS tarball")
set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
set(KOKKOS_MD5 "f140e02b826223b1045207d9bc10d404" CACHE STRING "MD5 checksum of KOKKOS tarball")
mark_as_advanced(KOKKOS_URL)
mark_as_advanced(KOKKOS_MD5)
ExternalProject_Add(kokkos_build
@@ -74,7 +74,7 @@ if(DOWNLOAD_KOKKOS)
add_dependencies(LAMMPS::KOKKOSCORE kokkos_build)
add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build)
elseif(EXTERNAL_KOKKOS)
find_package(Kokkos 3.7.00 REQUIRED CONFIG)
find_package(Kokkos 3.7.01 REQUIRED CONFIG)
target_link_libraries(lammps PRIVATE Kokkos::kokkos)
target_link_libraries(lmp PRIVATE Kokkos::kokkos)
else()
@@ -105,13 +105,12 @@ Either the full word or an abbreviation can be used for the keywords.

Note that the keywords do not use a leading minus sign. I.e. the
keyword is "t", not "-t". Also note that each of the keywords has a
default setting. Examples of when to use these options and what
settings to use on different platforms is given on the :doc:`KOKKOS package <Speed_kokkos>`
doc page.
settings to use on different platforms is given on the :doc:`KOKKOS
package <Speed_kokkos>` doc page.

* d or device
* g or gpus
* t or threads
* n or numa

.. parsed-literal::

@@ -164,19 +163,10 @@ the number of physical cores per node, to use your available hardware
optimally. This also sets the number of threads used by the host when
LAMMPS is compiled with CUDA=yes.
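For example, a hypothetical command line (binary name, GPU count, and input
script are placeholders only) that requests two GPUs and eight threads per
MPI task would combine these keywords as

.. parsed-literal::

   mpirun -np 2 lmp -k on g 2 t 8 -sf kk -in in.lj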
.. parsed-literal::
.. deprecated:: 22Dec2022

   numa Nm

This option is only relevant when using pthreads with hwloc support.
In this case Nm defines the number of NUMA regions (typically sockets)
on a node which will be utilized by a single MPI rank. By default Nm
= 1. If this option is used the total number of worker-threads per
MPI rank is threads\*numa. Currently it is always almost better to
assign at least one MPI rank per NUMA region, and leave numa set to
its default value of 1. This is because letting a single process span
multiple NUMA regions induces a significant amount of cross NUMA data
traffic which is slow.
Support for the "numa" or "n" option was removed as its functionality
was ignored in Kokkos for some time already.

----------
@@ -1,5 +1,27 @@
# Change Log

## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)

### Bug Fixes:
- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547)
- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552)
- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521)
- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553)
- Fix warning calling a `__host__` function from a `__host__ __device__` function in `View::as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591)
- OpenMPTarget: add implementation to set device id [\#5557](https://github.com/kokkos/kokkos/pull/5557)
- Use `Kokkos::atomic_load` to correct a race condition causing segfaults in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559)
- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561)
- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565)
- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566)
- Do not add -cuda to the link line with the NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569)
- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571)
- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582)
- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573)
- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592)
- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607)
- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622)

## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
@@ -102,7 +124,6 @@
- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059)
- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)
@@ -129,7 +129,7 @@ ENDIF()

set(Kokkos_VERSION_MAJOR 3)
set(Kokkos_VERSION_MINOR 7)
set(Kokkos_VERSION_PATCH 00)
set(Kokkos_VERSION_PATCH 01)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
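# Illustrative check of the encoding above: for 3.7.01 this evaluates to
# 3 * 10000 + 7 * 100 + 1 = 30701.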
@@ -152,6 +152,7 @@ ENDIF()
# but scoping issues can make it difficult
GLOBAL_SET(KOKKOS_COMPILE_OPTIONS)
GLOBAL_SET(KOKKOS_LINK_OPTIONS)
GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS)
GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS)
@@ -228,6 +229,9 @@ IF (KOKKOS_HAS_TRILINOS)
# we have to match the annoying behavior, also we have to preserve quotes
# which needs another workaround.
SET(KOKKOS_COMPILE_OPTIONS_TMP)
IF (KOKKOS_ENABLE_HIP)
LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS})
ENDIF()
FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
IF(OPTION_HAS_WHITESPACE EQUAL -1)
@@ -12,7 +12,7 @@ endif

KOKKOS_VERSION_MAJOR = 3
KOKKOS_VERSION_MINOR = 7
KOKKOS_VERSION_PATCH = 00
KOKKOS_VERSION_PATCH = 01
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)

# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))

#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)

@@ -505,10 +507,6 @@ KOKKOS_LINK_FLAGS =
KOKKOS_SRC =
KOKKOS_HEADERS =

#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
KOKKOS_LIBS += -latomic
#endif

# Generating the KokkosCore_config.h file.

KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@@ -550,6 +548,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
endif

ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
KOKKOS_LIBS += -latomic
tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
@@ -1197,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
endif

ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
@@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)
@@ -265,8 +265,8 @@ class BinSort {
//----------------------------------------
// Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed
template <class ExecutionSpace = exec_space>
void create_permute_vector(const ExecutionSpace& exec = exec_space{}) {
template <class ExecutionSpace>
void create_permute_vector(const ExecutionSpace& exec) {
static_assert(
Kokkos::SpaceAccessibility<ExecutionSpace,
typename Space::memory_space>::accessible,
@@ -297,6 +297,15 @@ class BinSort {
*this);
}

// Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed
void create_permute_vector() {
Kokkos::fence("Kokkos::Binsort::create_permute_vector: before");
exec_space e{};
create_permute_vector(e);
e.fence("Kokkos::Binsort::create_permute_vector: after");
}
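// Illustrative usage sketch (view and variable names are hypothetical):
// passing an execution space instance keeps the work on that instance and
// avoids the global fence taken by the argument-free overload above.
//
//   Kokkos::DefaultExecutionSpace exec;
//   Kokkos::BinOp1D<KeyView> binner(num_bins, min_key, max_key);
//   Kokkos::BinSort<KeyView, Kokkos::BinOp1D<KeyView>> sorter(keys, binner);
//   sorter.create_permute_vector(exec);
//   sorter.sort(exec, values);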
// Sort a subset of a view with respect to the first dimension using the
// permutation array
template <class ExecutionSpace, class ValuesViewType>
@@ -372,9 +381,10 @@ class BinSort {
template <class ValuesViewType>
void sort(ValuesViewType const& values, int values_range_begin,
int values_range_end) const {
Kokkos::fence("Kokkos::Binsort::sort: before");
exec_space exec;
sort(exec, values, values_range_begin, values_range_end);
exec.fence("Kokkos::Sort: fence after sorting");
exec.fence("Kokkos::BinSort:sort: after");
}

template <class ExecutionSpace, class ValuesViewType>
@@ -641,9 +651,10 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(

template <class ViewType>
void sort(ViewType const& view) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec;
sort(exec, view);
exec.fence("Kokkos::Sort: fence after sorting");
exec.fence("Kokkos::sort: fence after sorting");
}

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
@@ -682,6 +693,7 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(

template <class ViewType>
void sort(ViewType view, size_t const begin, size_t const end) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec;
sort(exec, view, begin, end);
exec.fence("Kokkos::Sort: fence after sorting");
@@ -10,10 +10,12 @@
# Default settings: change those according to your machine. For
# example, you may have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten
# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc).
# by using the usual arguments (e.g., -arch=sm_80 -ccbin icpc).
# sm_70 is supported by every CUDA version from 9-12 and is thus
# chosen as default

default_arch="sm_35"
#default_arch="sm_50"
default_arch="sm_70"
#default_arch="sm_80"

#
# The default C++ compiler.
@@ -66,6 +66,7 @@
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
#cmakedefine KOKKOS_ARCH_ARMV81
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
#cmakedefine KOKKOS_ARCH_A64FX
#cmakedefine KOKKOS_ARCH_AMD_AVX2
#cmakedefine KOKKOS_ARCH_AVX
#cmakedefine KOKKOS_ARCH_AVX2
@@ -101,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
@@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")

IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -187,7 +188,9 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
SET(CUDA_ARCH_FLAG "-gpu")
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda)
GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled
GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
ENDIF()
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
SET(CUDA_ARCH_FLAG "-arch")
ENDIF()
@@ -542,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
CHECK_CUDA_ARCH(HOPPER90 sm_90)

SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -804,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()

IF (KOKKOS_ARCH_HOPPER90)
SET(KOKKOS_ARCH_HOPPER ON)
ENDIF()

#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)
@@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)
@@ -1701,7 +1701,11 @@ namespace Impl {
underlying memory, to facilitate implementation of deep_copy() and
other routines that are defined on View */
template <unsigned N, typename T, typename... Args>
KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView<T, Args...> v) {
KOKKOS_FUNCTION auto as_view_of_rank_n(
DynRankView<T, Args...> v,
typename std::enable_if<std::is_same<
typename ViewTraits<T, Args...>::specialize, void>::value>::type* =
nullptr) {
if (v.rank() != N) {
KOKKOS_IF_ON_HOST(
const std::string message =
@@ -2114,9 +2118,10 @@ inline auto create_mirror(
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename DynRankView<T, P...>::data_type,
typename DynRankView<T, P...>::HostMirror::data_type>::value,
@@ -2128,12 +2133,13 @@ create_mirror_view(const DynRankView<T, P...>& src,

template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename DynRankView<T, P...>::data_type,
typename DynRankView<T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename DynRankView<T, P...>::data_type,
typename DynRankView<T, P...>::HostMirror::data_type>::value),
typename DynRankView<T, P...>::HostMirror>
create_mirror_view(
const DynRankView<T, P...>& src,
@@ -2141,29 +2147,39 @@ create_mirror_view(
return Kokkos::Impl::create_mirror(src, arg_prop);
}

template <class Space, class T, class... P, class... ViewCtorArgs>
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t<
Kokkos::is_space<Space>::value &&
Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&, const Kokkos::DynRankView<T, P...>& src,
Kokkos::is_space<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view(const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}

template <class Space, class T, class... P, class... ViewCtorArgs>
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t<
Kokkos::is_space<Space>::value &&
!Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
Kokkos::is_space<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
!Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view(
const Space&, const Kokkos::DynRankView<T, P...>& src,
const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space;
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);

return Kokkos::Impl::create_mirror(src, prop_copy);
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl

@@ -2224,9 +2240,10 @@ create_mirror_view(

template <class Space, class T, class... P>
inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi,
const Space& space,
const Space&,
const Kokkos::DynRankView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
}

template <class T, class... P, class... ViewCtorArgs>
@@ -710,7 +710,7 @@ template <class Space, class T, class... P>
inline auto create_mirror(
const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror(
src, Impl::ViewCtorProp<>{typename Space::memory_space{}});
src, Kokkos::view_alloc(typename Space::memory_space{}));
}

template <class Space, class T, class... P>
@@ -729,48 +729,68 @@ inline auto create_mirror(
}

namespace Impl {

template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
create_mirror_view(
const typename Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}

template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::create_mirror(arg_prop, src);
}

template <class Space, class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
Impl::MirrorDynamicViewType<Space, T, P...>::is_same_memspace,
typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&,
const Kokkos::Experimental::DynamicView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}

template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl

// Create a mirror view in host space
@@ -790,8 +810,9 @@ inline auto create_mirror_view(
// Create a mirror in a new space
template <class Space, class T, class... P>
inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror_view(src,
view_alloc(typename Space::memory_space{}));
}

template <class Space, class T, class... P>
@@ -1901,19 +1901,22 @@ struct MirrorOffsetType {

namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror
inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror(
Kokkos::create_mirror(arg_prop, src.view()), src.begins());
}

template <class Space, class T, class... P, class... ViewCtorArgs>
inline typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
create_mirror(const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline auto create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space;

static_assert(
!alloc_prop_input::has_label,
@@ -1923,10 +1926,6 @@ create_mirror(const Space&,
!alloc_prop_input::has_pointer,
"The view constructor arguments passed to Kokkos::create_mirror must "
"not include a pointer!");
static_assert(
!alloc_prop_input::has_memory_space,
"The view constructor arguments passed to Kokkos::create_mirror must "
"not include a memory space instance!");
static_assert(
!alloc_prop_input::allow_padding,
"The view constructor arguments passed to Kokkos::create_mirror must "
@@ -1962,15 +1961,17 @@ inline auto create_mirror(
template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{});
const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}));
}

template <class Space, class T, class... P>
typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Kokkos::view_alloc(wi));
return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
}

template <class T, class... P, class... ViewCtorArgs>
@@ -1983,54 +1984,64 @@ inline auto create_mirror(
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror_view(
const typename Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}

template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::create_mirror(arg_prop, src);
}

template <class Space, class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
Kokkos::Experimental::OffsetView<T, P...>>
create_mirror_view(const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}

template <class Space, class T, class... P, class... ViewCtorArgs>
std::enable_if_t<
!Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type>
create_mirror_view(const Space& space,
const Kokkos::Experimental::OffsetView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return create_mirror(space, src, arg_prop);
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl

@@ -2052,15 +2063,17 @@ inline auto create_mirror_view(
template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}));
}

template <class Space, class T, class... P>
inline auto create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
}

template <class T, class... P, class... ViewCtorArgs>
@@ -46,3 +46,13 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources})
endif()
endforeach()

SET(COMPILE_ONLY_SOURCES
TestCreateMirror.cpp
)
KOKKOS_ADD_EXECUTABLE(
TestCompileOnly
SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES}
)
lib/kokkos/containers/unit_tests/TestCompileMain.cpp (new file)
@@ -0,0 +1 @@
int main() {}
lib/kokkos/containers/unit_tests/TestCreateMirror.cpp (new file)
@@ -0,0 +1,179 @@
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Core.hpp>
#include <Kokkos_DynamicView.hpp>
#include <Kokkos_DynRankView.hpp>
#include <Kokkos_OffsetView.hpp>

template <typename TestView, typename MemorySpace>
void check_memory_space(TestView, MemorySpace) {
  static_assert(
      std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
}

template <class View>
auto host_mirror_test_space(View) {
  return std::conditional_t<
      Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                 typename View::memory_space>::accessible,
      typename View::memory_space, Kokkos::HostSpace>{};
}

template <typename View>
void test_create_mirror_properties(const View& view) {
  using namespace Kokkos;
  using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;

  // clang-format off

  // create_mirror
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
    check_memory_space(create_mirror(                     view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
  check_memory_space(create_mirror(                     DefaultExecutionSpace{}, view), DeviceMemorySpace{});

  // create_mirror_view
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
    check_memory_space(create_mirror_view(                     view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
  check_memory_space(create_mirror_view(                     DefaultExecutionSpace{}, view), DeviceMemorySpace{});

  // create_mirror view_alloc
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
    check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
  check_memory_space(create_mirror(view_alloc(                     DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror_view view_alloc
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
    check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
  check_memory_space(create_mirror_view(view_alloc(                     DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
    check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
  check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror_view view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
    check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
  check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror_view_and_copy
  check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
  check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});

  // create_mirror_view_and_copy view_alloc
  check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
  check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror_view_and_copy view_alloc + execution space
  check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
  check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});

  // clang-format on
}

void test_create_mirror_dynrankview() {
  Kokkos::DynRankView<int, Kokkos::DefaultExecutionSpace> device_view(
      "device view", 10);
  Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);

  test_create_mirror_properties(device_view);
  test_create_mirror_properties(host_view);
}

void test_create_mirror_offsetview() {
  Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace>
      device_view("device view", {0, 10});
  Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
      "host view", {0, 10});

  test_create_mirror_properties(device_view);
  test_create_mirror_properties(host_view);
}

void test_create_mirror_dynamicview() {
  Kokkos::Experimental::DynamicView<int*, Kokkos::DefaultExecutionSpace>
      device_view("device view", 2, 10);
  Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
      "host view", 2, 10);

  test_create_mirror_properties(device_view);
  test_create_mirror_properties(host_view);
}
@@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib
TEST_OPTIONAL_TPLS CUSPARSE
)
@@ -53,13 +53,69 @@
namespace Kokkos {
namespace Impl {

inline int cuda_warp_per_sm_allocation_granularity(
    cudaDeviceProp const& properties) {
  // Allocation granularity of warps in each sm
  switch (properties.major) {
    case 3:
    case 5:
    case 7:
    case 8:
    case 9: return 4;
    case 6: return (properties.minor == 0 ? 2 : 4);
    default:
      throw_runtime_exception(
          "Unknown device in cuda warp per sm allocation granularity");
      return 0;
  }
}

inline int cuda_max_warps_per_sm_registers(
    cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
  // Maximum number of warps per sm as a function of register counts,
  // subject to the constraint that warps are allocated with a fixed granularity
  int const max_regs_per_block = properties.regsPerBlock;
  int const regs_per_warp      = attributes.numRegs * properties.warpSize;
  int const warp_granularity =
      cuda_warp_per_sm_allocation_granularity(properties);
  // The granularity of register allocation is chunks of 256 registers per warp,
  // which implies a need to over-allocate, so we round up
  int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);

  // The maximum number of warps per SM is constrained from above by register
  // allocation. To satisfy the constraint that warps per SM is allocated at a
  // finite granularity, we need to round down.
  int const max_warps_per_sm =
      warp_granularity *
      (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));

  return max_warps_per_sm;
}
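// Worked example with hypothetical numbers: numRegs = 40 and warpSize = 32
// give regs_per_warp = 1280, which rounds up to allocated_regs_per_warp = 1536.
// Assuming regsPerBlock = 65536 and a warp allocation granularity of 4, the
// result is 4 * (65536 / (1536 * 4)) = 4 * 10 = 40 warps per SM.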

inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
                                         cudaFuncAttributes const& attributes,
                                         int block_size, size_t dynamic_shmem) {
  // Limits due do registers/SM
  // Limits due to registers/SM
  int const regs_per_sm     = properties.regsPerMultiprocessor;
  int const regs_per_thread = attributes.numRegs;
  int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
  // The granularity of register allocation is chunks of 256 registers per warp
  // -> 8 registers per thread
  int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
  int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);

  // Compute the maximum number of warps as a function of the number of
  // registers
  int const max_warps_per_sm_registers =
      cuda_max_warps_per_sm_registers(properties, attributes);

  // Constrain the number of blocks to respect the maximum number of warps per
  // SM On face value this should be an equality, but due to the warp
  // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
  // left-hand-side of this comparison can overshoot what the hardware allows
  // based on register counts alone
  while ((max_blocks_regs * block_size / properties.warpSize) >
         max_warps_per_sm_registers)
    max_blocks_regs--;

  // Limits due to shared memory/SM
  size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -203,40 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
                              LaunchBounds{});
}

// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these number can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
//   "Compute Capability" first and check what is the smallest "Shared Memory
//   Size Config" that is available. The "Shared Memory Per Multiprocessor" in
//   bytes is then to be found below in the summary.
// * Another option would be to look for the information in the "Tuning
//   Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
//   the "Shared Memory" section (more tedious)
inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
  int const compute_capability = properties.major * 10 + properties.minor;
  return [compute_capability]() {
    switch (compute_capability) {
      case 30:
      case 32:
      case 35: return 16;
      case 37: return 80;
      case 50:
      case 53:
      case 60:
      case 62: return 64;
      case 52:
      case 61: return 96;
      case 70:
      case 80:
      case 86: return 8;
      case 75: return 32;
      default:
        Kokkos::Impl::throw_runtime_exception(
            "Unknown device in cuda block size deduction");
    }
    return 0;
  }() * 1024;
template <class LaunchBounds>
int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
                                     LaunchBounds) {
  auto const& prop = Kokkos::Cuda().cuda_device_prop();

  // Thin version of cuda_get_opt_block_size for cases where there is no shared
  // memory
  auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };

  return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
                                LaunchBounds{});
}

} // namespace Impl
} // namespace Kokkos
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif  // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010

#if CUDA_VERSION >= 11010 && \
    ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
    ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
  }
#endif

#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
  cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
#else
  cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
#endif

  // Init the array for used for arbitrarily sized atomics
  if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
@@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
//   For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------

template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,119 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
  }
}

// This function needs to be template on DriverType and LaunchBounds
// These functions needs to be template on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.

template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(KernelFuncPtr const& func,
                                        bool prefer_shmem) {
#ifndef KOKKOS_ARCH_KEPLER
  // On Kepler the L1 has no benefit since it doesn't cache reads
  auto set_cache_config = [&] {
    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
        func,
        (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
    return prefer_shmem;
const cudaFuncAttributes& get_cuda_kernel_func_attributes(
    const KernelFuncPtr& func) {
  // Only call cudaFuncGetAttributes once for each unique kernel
  // by leveraging static variable initialization rules
  auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
    cudaFuncAttributes attr;
    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
    return attr;
  };
  static bool cache_config_preference_cached = set_cache_config();
  if (cache_config_preference_cached != prefer_shmem) {
  static cudaFuncAttributes func_attr = wrap_get_attributes();
  return func_attr;
}
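// Note: the function-local static above relies on C++11's guarantee of
// thread-safe initialization, so even concurrent first calls from multiple
// host threads query cudaFuncGetAttributes only once per kernel type.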

template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(const KernelFuncPtr& func,
                                       const cudaDeviceProp& device_props,
                                       const size_t block_size, int& shmem,
                                       const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER

  const auto& func_attr =
      get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);

  // Compute limits for number of blocks due to registers/SM
  const size_t regs_per_sm     = device_props.regsPerMultiprocessor;
  const size_t regs_per_thread = func_attr.numRegs;
  // The granularity of register allocation is chunks of 256 registers per warp
  // -> 8 registers per thread
  const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
  size_t max_blocks_regs =
      regs_per_sm / (allocated_regs_per_thread * block_size);

  // Compute the maximum number of warps as a function of the number of
  // registers
  const size_t max_warps_per_sm_registers =
      cuda_max_warps_per_sm_registers(device_props, func_attr);

  // Constrain the number of blocks to respect the maximum number of warps per
  // SM On face value this should be an equality, but due to the warp
  // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
  // left-hand-side of this comparison can overshoot what the hardware allows
  // based on register counts alone
  while ((max_blocks_regs * block_size / device_props.warpSize) >
         max_warps_per_sm_registers)
    max_blocks_regs--;

  // Compute how many threads per sm we actually want
  const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
  // only allocate multiples of warp size
  const size_t num_threads_desired =
      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
  // Get close to the desired occupancy,
  // don't undershoot by much but also don't allocate a whole new block just
  // because one is a few threads over otherwise.
  size_t num_blocks_desired =
      (num_threads_desired + block_size * 0.8) / block_size;
  num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
  if (num_blocks_desired == 0) num_blocks_desired = 1;
|
||||
|
||||
// Calculate how much shared memory we need per block
|
||||
size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
|
||||
|
||||
// The minimum shared memory allocation we can have in total per SM is 8kB.
|
||||
// If we want to lower occupancy we have to make sure we request at least that
|
||||
// much in aggregate over all blocks, so that shared memory actually becomes a
|
||||
// limiting factor for occupancy
|
||||
constexpr size_t min_shmem_size_per_sm = 8192;
|
||||
if ((occupancy < 100) &&
|
||||
(shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
|
||||
shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
|
||||
// Need to set the caller's shmem variable so that the
|
||||
// kernel launch uses the correct dynamic shared memory request
|
||||
shmem = shmem_per_block - func_attr.sharedSizeBytes;
|
||||
}
|
||||
|
||||
// Compute the carveout fraction we need based on occupancy
|
||||
// Use multiples of 8kB
|
||||
const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
|
||||
size_t carveout = shmem_per_block == 0
|
||||
? 0
|
||||
: 100 *
|
||||
(((num_blocks_desired * shmem_per_block +
|
||||
min_shmem_size_per_sm - 1) /
|
||||
min_shmem_size_per_sm) *
|
||||
min_shmem_size_per_sm) /
|
||||
max_shmem_per_sm;
|
||||
if (carveout > 100) carveout = 100;
|
||||
|
||||
// Set the carveout, but only call it once per kernel or when it changes
|
||||
auto set_cache_config = [&] {
|
||||
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
|
||||
func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
|
||||
return carveout;
|
||||
};
|
||||
// Store the value in a static variable so we only reset if needed
|
||||
static size_t cache_config_preference_cached = set_cache_config();
|
||||
if (cache_config_preference_cached != carveout) {
|
||||
cache_config_preference_cached = set_cache_config();
|
||||
}
|
||||
#else
|
||||
// Use the parameters so we don't get a warning
|
||||
(void)func;
|
||||
(void)prefer_shmem;
|
||||
(void)device_props;
|
||||
(void)block_size;
|
||||
(void)occupancy;
|
||||
#endif
|
||||
}
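The arithmetic in the new configure_shmem_preference overload above (register rounding, warp-granular thread counts, 8 KiB carveout granules) can be exercised on the host. A sketch with made-up device and kernel numbers, illustrative only and not part of the change:

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical device/kernel characteristics.
  const size_t regs_per_sm = 65536, regs_per_thread = 42;
  const size_t block_size = 256, occupancy = 50;  // percent
  const size_t max_threads_per_sm = 2048, max_shmem_per_sm = 102400;
  const size_t shmem_per_block = 1024;

  // Registers are allocated in chunks of 256 per warp -> multiples of 8 per thread.
  const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
  const size_t max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);

  // Threads desired for the requested occupancy, rounded to whole warps.
  const size_t num_threads_desired =
      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
  size_t num_blocks_desired = (num_threads_desired + block_size * 0.8) / block_size;
  num_blocks_desired = std::min(max_blocks_regs, num_blocks_desired);
  if (num_blocks_desired == 0) num_blocks_desired = 1;

  // Carveout request in percent, rounded up to 8 KiB granules.
  const size_t granule = 8192;
  size_t carveout =
      100 * (((num_blocks_desired * shmem_per_block + granule - 1) / granule) * granule) /
      max_shmem_per_sm;
  if (carveout > 100) carveout = 100;

  std::printf("blocks=%zu carveout=%zu%%\n", num_blocks_desired, carveout);
}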
template <class Policy>
std::enable_if_t<Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const& policy, cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
bool& prefer_shmem) {
int const block_size = block.x * block.y * block.z;
int const desired_occupancy = policy.impl_get_desired_occupancy().value();

size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
size_t const static_shmem = attributes.sharedSizeBytes;

// round to nearest integer and avoid division by zero
int active_blocks = std::max(
1, static_cast<int>(std::round(
static_cast<double>(properties.maxThreadsPerMultiProcessor) /
block_size * desired_occupancy / 100)));
int const dynamic_shmem =
shmem_per_sm_prefer_l1 / active_blocks - static_shmem;

if (dynamic_shmem > shmem) {
shmem = dynamic_shmem;
prefer_shmem = false;
}
}
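For contrast with the carveout approach, the pre-existing helper above lowers occupancy by padding the dynamic shared-memory request. A host-side sketch of that rounding, with illustrative numbers only:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const int max_threads_per_sm = 2048, block_size = 256;
  const int desired_occupancy = 50;              // percent
  const int shmem_per_sm_prefer_l1 = 64 * 1024;  // hypothetical value
  const int static_shmem = 256;
  int shmem = 1024;  // incoming dynamic request

  // Blocks that should stay resident to hit the desired occupancy.
  const int active_blocks = std::max(
      1, static_cast<int>(std::round(static_cast<double>(max_threads_per_sm) /
                                     block_size * desired_occupancy / 100)));
  const int dynamic_shmem = shmem_per_sm_prefer_l1 / active_blocks - static_shmem;

  // Only grow the request; a larger per-block footprint caps residency.
  if (dynamic_shmem > shmem) shmem = dynamic_shmem;
  std::printf("active_blocks=%d shmem=%d\n", active_blocks, shmem);
}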
template <class Policy>
std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}

// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================

@ -348,7 +400,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) {
CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));

@ -358,8 +410,19 @@ struct CudaParallelLaunchKernelInvoker<

if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}

void const* args[] = {&driver};

@ -442,7 +505,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) {
CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));

@ -452,8 +515,18 @@ struct CudaParallelLaunchKernelInvoker<

if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}

auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);

@ -566,7 +639,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) {
CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require

@ -580,7 +653,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
driver, grid, block, shmem, cuda_instance, prefer_shmem);
driver, grid, block, shmem, cuda_instance);
}
#endif
};

@ -613,8 +686,7 @@ struct CudaParallelLaunchImpl<

inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
const CudaInternal* cuda_instance,
bool prefer_shmem) {
const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel

@ -623,20 +695,22 @@ struct CudaParallelLaunchImpl<

Impl::check_shmem_request(cuda_instance, shmem);

// If a desired occupancy is specified, we compute how much shared memory
// to ask for to achieve that occupancy, assuming that the cache
// configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
// shared memory computed is actually smaller than `shmem` we overwrite
// `shmem` and set `prefer_shmem` to `false`.
modify_launch_configuration_if_desired_occupancy_is_specified(
driver.get_policy(), cuda_instance->m_deviceProp,
get_cuda_func_attributes(), block, shmem, prefer_shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}

Impl::configure_shmem_preference<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), prefer_shmem);

ensure_cuda_lock_arrays_on_device();
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

// Invoke the driver function on the device
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);

@ -650,18 +724,9 @@ struct CudaParallelLaunchImpl<
}

static cudaFuncAttributes get_cuda_func_attributes() {
// Race condition inside of cudaFuncGetAttributes if the same address is
// given requires using a local variable as input instead of a static Rely
// on static variable initialization to make sure only one thread executes
// the code and the result is visible.
auto wrap_get_attributes = []() -> cudaFuncAttributes {
cudaFuncAttributes attr_tmp;
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
return attr_tmp;
};
static cudaFuncAttributes attr = wrap_get_attributes();
return attr;
return get_cuda_kernel_func_attributes<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func());
}
};

@ -79,7 +79,8 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
void initialize_host_cuda_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
desul::Impl::init_lock_arrays();
desul::ensure_cuda_lock_arrays_on_device();

DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
#endif
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
KOKKOS_IMPL_CUDA_SAFE_CALL(

@ -88,7 +89,7 @@ void initialize_host_cuda_lock_arrays() {
Impl::cuda_device_synchronize(
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
g_host_cuda_lock_arrays.n = Cuda::concurrency();
copy_cuda_lock_arrays_to_device();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
256>>>();
Impl::cuda_device_synchronize(

@ -105,7 +106,7 @@ void finalize_host_cuda_lock_arrays() {
g_host_cuda_lock_arrays.atomic = nullptr;
g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
copy_cuda_lock_arrays_to_device();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif
}

@ -67,7 +67,7 @@ struct CudaLockArrays {

/// \brief This global variable in Host space is the central definition
/// of these arrays.
extern CudaLockArrays g_host_cuda_lock_arrays;
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;

/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays.

@ -105,12 +105,12 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__constant__ extern
#endif
CudaLockArrays g_device_cuda_lock_arrays;
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;

#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF

@ -123,7 +123,9 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
return (
0 ==
atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
}
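The lock array touched by this hunk maps an arbitrary address to one of 2^17 slots by discarding the low two bits and masking. A portable host-side sketch of the same hashing, with std::atomic standing in for atomicCAS (table size and semantics only illustrative):

#include <atomic>
#include <cstdint>
#include <cstdio>

constexpr std::size_t kMask = 0x1FFFF;   // mirrors CUDA_SPACE_ATOMIC_MASK
std::atomic<int> lock_slots[kMask + 1];  // zero-initialized (static storage)

bool try_lock_address(void* ptr) {
  std::size_t offset = reinterpret_cast<std::size_t>(ptr);
  offset >>= 2;   // drop sub-word alignment bits
  offset &= kMask;  // fold into the table
  int expected = 0;
  return lock_slots[offset].compare_exchange_strong(expected, 1);
}

void unlock_address(void* ptr) {
  std::size_t offset = (reinterpret_cast<std::size_t>(ptr) >> 2) & kMask;
  lock_slots[offset].store(0);
}

int main() {
  int x = 0;
  std::printf("%d\n", try_lock_address(&x));  // 1: acquired
  std::printf("%d\n", try_lock_address(&x));  // 0: already held
  unlock_address(&x);
}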
/// \brief Release lock for the address

@ -136,7 +138,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
}

} // namespace Impl

@ -149,49 +151,45 @@ namespace {
static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace
} // namespace Impl
} // namespace Kokkos

#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline
#else
static
#endif
void
copy_cuda_lock_arrays_to_device() {
if (lock_array_copied == 0) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
&g_host_cuda_lock_arrays,
sizeof(CudaLockArrays)));
/* Dan Ibanez: it is critical that this code be a macro, so that it will
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
putting this in an inline function will NOT do the right thing! */
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
if (::Kokkos::Impl::lock_array_copied == 0) { \
KOKKOS_IMPL_CUDA_SAFE_CALL( \
cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
&Kokkos::Impl::g_host_cuda_lock_arrays, \
sizeof(Kokkos::Impl::CudaLockArrays))); \
} \
lock_array_copied = 1; \
}
lock_array_copied = 1;
}

#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS

#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
inline static void ensure_cuda_lock_arrays_on_device() {
copy_cuda_lock_arrays_to_device();
}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
#endif

#else

#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
inline static void ensure_cuda_lock_arrays_on_device() {
copy_cuda_lock_arrays_to_device();
desul::ensure_cuda_lock_arrays_on_device();
}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#endif

#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */

} // namespace Impl
} // namespace Kokkos

#endif /* defined( KOKKOS_ENABLE_CUDA ) */

#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */

@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {

template <typename ParallelType, typename Policy, typename LaunchBounds>
int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelType,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();

// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const optimal_block_size =
Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});

// Compute how many blocks of this size we can launch, based on warp
// constraints
int const max_warps_per_sm_registers =
Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
int const max_num_threads_from_warps =
max_warps_per_sm_registers * prop.warpSize;
int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;

// Compute the total number of threads
int const max_threads_per_sm = optimal_block_size * max_num_blocks;

return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
}

template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:

@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelFor,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {

@ -121,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);

@ -139,8 +155,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z

@ -158,8 +173,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z

@ -175,8 +189,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z

@ -191,8 +204,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}

@ -263,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelReduce,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due do registers/SM
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {

@ -405,8 +407,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,

CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute

if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {

@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif

CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}

ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,

CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute

if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {

@ -465,8 +464,24 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
public:
using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type;
using value_type = typename Analysis::value_type;
using functor_type = FunctorType;
using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

private:
// Algorithmic constraints:
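A stand-alone illustration of the word_size_type alias added in the hunk above: the scan's word type shrinks to match small scalars so that value_size / sizeof(word) never truncates to zero. This compiles as ordinary C++17; the size_type stand-in is an assumption made only for the sketch.

#include <cstdint>
#include <type_traits>

using size_type = std::uint32_t;  // stand-in for Cuda::size_type

template <class ValueType>
using word_size_type = std::conditional_t<
    sizeof(ValueType) < sizeof(size_type),
    std::conditional_t<sizeof(ValueType) == 2, std::int16_t, std::int8_t>,
    size_type>;

static_assert(std::is_same_v<word_size_type<double>, size_type>);        // >= 4 bytes: 32-bit words
static_assert(std::is_same_v<word_size_type<std::int16_t>, std::int16_t>);
static_assert(std::is_same_v<word_size_type<char>, std::int8_t>);

int main() {}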
@ -477,7 +492,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {

const FunctorType m_functor;
const Policy m_policy;
size_type* m_scratch_space;
word_size_type* m_scratch_space;
size_type* m_scratch_flags;
size_type m_final;
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION

@ -501,12 +516,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor);

const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() +
word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y;

final_reducer.init(reinterpret_cast<pointer_type>(shared_value));

@ -532,7 +547,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// gridDim.x
cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags);
}

@ -541,21 +556,22 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor);

const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
size_type* const shared_prefix =
word_size_type* const shared_data =
kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum =
word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1);

// Starting value for this thread block is the previous block's total.
if (blockIdx.x) {
size_type* const block_total =
word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];

@ -602,7 +618,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
typename Analysis::pointer_type(shared_data + word_count.value));

{
size_type* const block_total =
word_size_type* const block_total =
shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];

@ -690,8 +706,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// How many block are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block;

m_scratch_space = cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x);
m_scratch_space =
reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);

@ -708,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
}
}

@ -752,10 +769,26 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
Policy, FunctorType>;

public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType;
using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

private:
// Algorithmic constraints:

@ -766,7 +799,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,

const FunctorType m_functor;
const Policy m_policy;
size_type* m_scratch_space;
word_size_type* m_scratch_space;
size_type* m_scratch_flags;
size_type m_final;
ReturnType& m_returnvalue;

@ -791,12 +824,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor);

const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() +
word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y;

final_reducer.init(reinterpret_cast<pointer_type>(shared_value));

@ -822,7 +855,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// gridDim.x
cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags);
}

@ -831,21 +864,22 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor);

const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
size_type* const shared_prefix =
word_size_type* const shared_data =
kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum =
word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1);

// Starting value for this thread block is the previous block's total.
if (blockIdx.x) {
size_type* const block_total =
word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];

@ -894,7 +928,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
typename Analysis::pointer_type(shared_data + word_count.value));

{
size_type* const block_total =
word_size_type* const block_total =
shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];

@ -983,8 +1017,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// How many block are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block;

m_scratch_space = cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x);
m_scratch_space =
reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);

@ -1002,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute

const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION

@ -1022,7 +1057,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
#endif
DeepCopy<HostSpace, CudaSpace, Cuda>(
m_policy.space(), &m_returnvalue,
m_scratch_space + (grid_x - 1) * size / sizeof(int), size);
m_scratch_space + (grid_x - 1) * size / sizeof(word_size_type),
size);
}
}
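The DeepCopy offset in the hunk above must be expressed in units of the scratch pointer's element type. A small host-side sketch of why dividing the byte offset by sizeof(int) would collapse to zero once the scratch buffer is typed as int8_t (values are illustrative, not taken from the change):

#include <cstdint>
#include <cstdio>

int main() {
  using word_size_type = std::int8_t;  // scratch element type for a 1-byte scalar
  const int grid_x = 4;
  const int size = 1;  // value_size of the functor, in bytes

  word_size_type scratch[16] = {};
  // Element offset of the last block's total, matching the kernel's layout.
  word_size_type* correct = scratch + (grid_x - 1) * size / sizeof(word_size_type);
  // Dividing by sizeof(int) instead would point at element 0 here.
  word_size_type* wrong = scratch + (grid_x - 1) * size / sizeof(int);

  std::printf("correct index %td, wrong index %td\n", correct - scratch, wrong - scratch);
}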
@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,

CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(),
true); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
}

ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,

CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(),
true); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute

if (!m_result_ptr_device_accessible) {
m_policy.space().fence(

@ -116,6 +116,7 @@ __device__ inline void cuda_inter_warp_reduction(
value = result[0];
for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++)
reducer.join(&value, &result[i]);
__syncthreads();
}

template <class ValueType, class ReducerType>

@ -427,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/*
 * Algorithmic constraints:
 * (a) blockDim.y <= 1024

@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;

Kokkos::Impl::CudaParallelLaunch<Self>(
*this, grid, block, shared, Cuda().impl_internal_space_instance(),
false);
*this, grid, block, shared, Cuda().impl_internal_space_instance());
}

inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

@ -448,11 +448,27 @@ class ParallelScanHIPBase {
Policy, FunctorType>;

public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType;
using size_type = Kokkos::Experimental::HIP::size_type;
using index_type = typename Policy::index_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::HIP::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

protected:
// Algorithmic constraints:

@ -463,10 +479,10 @@ class ParallelScanHIPBase {

const FunctorType m_functor;
const Policy m_policy;
size_type* m_scratch_space = nullptr;
size_type* m_scratch_flags = nullptr;
size_type m_final = false;
int m_grid_x = 0;
word_size_type* m_scratch_space = nullptr;
size_type* m_scratch_flags = nullptr;
size_type m_final = false;
int m_grid_x = 0;
// Only let one ParallelReduce/Scan modify the shared memory. The
// constructor acquires the mutex which is released in the destructor.
std::lock_guard<std::mutex> m_shared_memory_lock;

@ -489,12 +505,12 @@ class ParallelScanHIPBase {
__device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor);

const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

pointer_type const shared_value = reinterpret_cast<pointer_type>(
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>() +
word_count.value * threadIdx.y);

final_reducer.init(shared_value);

@ -518,7 +534,7 @@ class ParallelScanHIPBase {
// gridDim.x
hip_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x,
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(),
Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>(),
m_scratch_space, m_scratch_flags);
}

@ -527,22 +543,22 @@ class ParallelScanHIPBase {
__device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor);

const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... }
size_type* const shared_data =
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>();
size_type* const shared_prefix =
word_size_type* const shared_data =
Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum =
word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1);

// Starting value for this thread block is the previous block's total.
if (blockIdx.x) {
size_type* const block_total =
word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];

@ -588,7 +604,7 @@ class ParallelScanHIPBase {
typename Analysis::pointer_type(shared_data + word_count.value));

{
size_type* const block_total =
word_size_type* const block_total =
shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];

@ -647,8 +663,9 @@ class ParallelScanHIPBase {
// How many block are really needed for this much work:
m_grid_x = (nwork + work_per_block - 1) / work_per_block;

m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x);
m_scratch_space = reinterpret_cast<word_size_type*>(
Kokkos::Experimental::Impl::hip_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x));
m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
m_policy.space(), sizeof(size_type) * 1);

@ -734,7 +751,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
Kokkos::Experimental::HIP>(
Base::m_policy.space(), &m_returnvalue,
Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int),
Base::m_scratch_space + (Base::m_grid_x - 1) * size /
sizeof(typename Base::word_size_type),
size);
}
}

@ -225,11 +225,11 @@ struct HIPReductionsFunctor<FunctorType, false> {
}
}

template <typename SizeType>
__device__ static inline bool scalar_inter_block_reduction(
FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) {
Scalar* const global_team_buffer_element =
reinterpret_cast<Scalar*>(global_data);

@ -411,16 +411,14 @@ __device__ void hip_intra_block_reduce_scan(
 * Global reduce result is in the last threads' 'shared_data' location.
 */

template <bool DoScan, class FunctorType>
template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan_impl(
FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) {
using size_type = ::Kokkos::Experimental::HIP::size_type;

using size_type = SizeType;
using value_type = typename FunctorType::value_type;
using pointer_type = typename FunctorType::pointer_type;

@ -518,13 +516,12 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
return is_last_block;
}

template <bool DoScan, typename FunctorType>
template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan(
FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) {
// If we are doing a reduction and we don't do an array reduction, we use the
// reduction-only path. Otherwise, we use the common path between reduction

@ -116,6 +116,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction(
value = result[0];
for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
reducer.join(&value, &result[i]);
__syncthreads();
}

template <typename ValueType, typename ReducerType>

@ -3711,12 +3711,13 @@ namespace Impl {

template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
(std::is_same<
typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same<
typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
typename Kokkos::View<T, P...>::HostMirror>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {

@ -3725,12 +3726,13 @@ create_mirror_view(const Kokkos::View<T, P...>& src,

template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
typename Kokkos::View<T, P...>::HostMirror>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {

@ -3738,25 +3740,33 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
}

// Create a mirror view in a new space (specialization for same space)
template <class Space, class T, class... P, class... ViewCtorArgs>
std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}

// Create a mirror view in a new space (specialization for different space)
template <class Space, class T, class... P, class... ViewCtorArgs>
std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space;
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);

return Kokkos::Impl::create_mirror(src, prop_copy);
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl

@ -3815,9 +3825,10 @@ typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, Space const& space,
Kokkos::Impl::WithoutInitializing_t wi, Space const&,
Kokkos::View<T, P...> const& v) {
return Impl::create_mirror_view(space, v, view_alloc(wi));
return Impl::create_mirror_view(
v, view_alloc(typename Space::memory_space{}, wi));
}
|
||||
|
||||
template <class T, class... P, class... ViewCtorArgs>
|
||||
|
||||
@ -1754,7 +1754,10 @@ struct RankDataType<ValueType, 0> {
};

template <unsigned N, typename... Args>
KOKKOS_FUNCTION std::enable_if_t<N == View<Args...>::Rank, View<Args...>>
KOKKOS_FUNCTION std::enable_if_t<
N == View<Args...>::Rank &&
std::is_same<typename ViewTraits<Args...>::specialize, void>::value,
View<Args...>>
as_view_of_rank_n(View<Args...> v) {
return v;
}
@ -1762,13 +1765,13 @@ as_view_of_rank_n(View<Args...> v) {
// Placeholder implementation to compile generic code for DynRankView; should
// never be called
template <unsigned N, typename T, typename... Args>
std::enable_if_t<
N != View<T, Args...>::Rank,
KOKKOS_FUNCTION std::enable_if_t<
N != View<T, Args...>::Rank &&
std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value,
View<typename RankDataType<typename View<T, Args...>::value_type, N>::type,
Args...>>
as_view_of_rank_n(View<T, Args...>) {
Kokkos::Impl::throw_runtime_exception(
"Trying to get at a View of the wrong rank");
Kokkos::abort("Trying to get at a View of the wrong rank");
return {};
}

@ -101,8 +101,8 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
void push_work(const std::int32_t w) const noexcept {
const std::int32_t N = m_graph.numRows();

std::int32_t volatile* const ready_queue = &m_queue[0];
std::int32_t volatile* const end_hint = &m_queue[2 * N + 1];
std::int32_t* const ready_queue = &m_queue[0];
std::int32_t* const end_hint = &m_queue[2 * N + 1];

// Push work to end of queue
const std::int32_t j = atomic_fetch_add(end_hint, 1);
@ -134,14 +134,14 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
std::int32_t pop_work() const noexcept {
const std::int32_t N = m_graph.numRows();

std::int32_t volatile* const ready_queue = &m_queue[0];
std::int32_t volatile* const begin_hint = &m_queue[2 * N];
std::int32_t* const ready_queue = &m_queue[0];
std::int32_t* const begin_hint = &m_queue[2 * N];

// begin hint is guaranteed to be less than or equal to
// actual begin location in the queue.

for (std::int32_t i = *begin_hint; i < N; ++i) {
const std::int32_t w = ready_queue[i];
for (std::int32_t i = Kokkos::atomic_load(begin_hint); i < N; ++i) {
const std::int32_t w = Kokkos::atomic_load(&ready_queue[i]);

if (w == END_TOKEN) {
return END_TOKEN;
@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {

const std::int32_t N = m_graph.numRows();

std::int32_t volatile* const count_queue = &m_queue[N];
std::int32_t* const count_queue = &m_queue[N];

const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w + 1);
@ -199,7 +199,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {

KOKKOS_INLINE_FUNCTION
void operator()(const TagCount, int i) const noexcept {
std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()];
std::int32_t* const count_queue = &m_queue[m_graph.numRows()];

atomic_increment(count_queue + m_graph.entries[i]);
}

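The WorkGraphPolicy hunks above drop the volatile qualifier from the queue pointers and instead read them through Kokkos::atomic_load, making the intended synchronization explicit. A minimal stand-alone sketch of that pattern (illustrative only, not taken from the patch; the view name and sizes are made up):

#include <Kokkos_Core.hpp>
#include <cstdint>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<std::int32_t*> queue("queue", 8);
    Kokkos::parallel_for("publish", 8, KOKKOS_LAMBDA(int i) {
      // Publish a value; pairs with the atomic_load in the next kernel.
      Kokkos::atomic_store(&queue(i), std::int32_t(10 * i));
    });
    Kokkos::parallel_for("poll", 8, KOKKOS_LAMBDA(int i) {
      // Plain (non-volatile) pointer, read atomically instead.
      const std::int32_t w = Kokkos::atomic_load(&queue(i));
      (void)w;
    });
    Kokkos::fence();
  }
  Kokkos::finalize();
}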
@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {

struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_AMPERE)
defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else

@ -47,6 +47,7 @@
#endif

#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_DeviceManagement.hpp>

#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)

@ -114,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {

// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
@ -164,7 +166,11 @@ void OpenMPTarget::impl_static_fence(const std::string& name) {
name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
}

void OpenMPTarget::impl_initialize(InitializationSettings const&) {
void OpenMPTarget::impl_initialize(InitializationSettings const& settings) {
using Kokkos::Impl::get_gpu;
const int device_num = get_gpu(settings);
omp_set_default_device(device_num);

Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize();
}
void OpenMPTarget::impl_finalize() {

@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
!defined(KOKKOS_ARCH_AMPERE)
!defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;

@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

@ -110,10 +110,9 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept {

return ((uint64_t)a) | (((uint64_t)d) << 32);

#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \
defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__)
#elif defined(__powerpc64__) || defined(__ppc64__)

unsigned int cycles = 0;
unsigned long cycles = 0;

asm volatile("mftb %0" : "=r"(cycles));

@ -166,6 +166,8 @@ int get_device_count() {
#elif defined(KOKKOS_ENABLE_OPENACC)
return acc_get_num_devices(
Kokkos::Experimental::Impl::OpenACC_Traits::dev_type);
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
return omp_get_num_devices();
#else
Kokkos::abort("implementation bug");
return -1;
@ -426,11 +428,17 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) {
Kokkos::abort("implementation bug");
}

auto const* local_rank_str =
std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); // OpenMPI
if (!local_rank_str)
local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); // MVAPICH2
if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); // SLURM
char const* local_rank_str = nullptr;
for (char const* env_var : {
"OMPI_COMM_WORLD_LOCAL_RANK", // OpenMPI
"MV2_COMM_WORLD_LOCAL_RANK", // MVAPICH2
"MPI_LOCALRANKID", // MPICH
"SLURM_LOCALID", // SLURM
"PMI_LOCAL_RANK" // PMI
}) {
local_rank_str = std::getenv(env_var);
if (local_rank_str) break;
}

// use first GPU available for execution if unable to detect local MPI rank
if (!local_rank_str) {

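The get_gpu() change above generalizes local-rank detection from three hard-coded variables to a prioritized list of launcher environment variables. For reference, a stand-alone sketch of the same "first matching variable wins" lookup as an application-side launcher might use it (the helper name and the device count are illustrative, not Kokkos API):

#include <cstdio>
#include <cstdlib>
#include <initializer_list>
#include <string>

int pick_device(int num_devices) {
  char const* local_rank_str = nullptr;
  for (char const* env_var : {
           "OMPI_COMM_WORLD_LOCAL_RANK",  // OpenMPI
           "MV2_COMM_WORLD_LOCAL_RANK",   // MVAPICH2
           "MPI_LOCALRANKID",             // MPICH
           "SLURM_LOCALID",               // SLURM
           "PMI_LOCAL_RANK"               // PMI
       }) {
    local_rank_str = std::getenv(env_var);
    if (local_rank_str) break;
  }
  if (!local_rank_str || num_devices <= 0) return 0;  // fall back to device 0
  return std::stoi(local_rank_str) % num_devices;     // one GPU per local rank
}

int main() {
  std::printf("local rank would map to device %d (of 4)\n", pick_device(4));
}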
@ -1128,9 +1128,8 @@ struct ViewOffset<
KOKKOS_INLINE_FUNCTION constexpr ViewOffset(
const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1),
static_assert(((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1)),
"ViewOffset LayoutLeft and LayoutRight are only compatible "
"when rank <= 1");
}
@ -1778,8 +1777,7 @@ struct ViewOffset<
const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1),
(DimRHS::rank == 1 && dimension_type::rank == 1),
"ViewOffset LayoutRight and LayoutLeft are only compatible "
"when rank <= 1");
}
@ -3059,10 +3057,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
std::is_trivially_copy_assignable<Dummy>::value>
construct_shared_allocation() {
// Shortcut for zero initialization
ValueType value{};
// On A64FX memset seems to do the wrong thing with regards to first touch
// leading to the significant performance issues
#ifndef KOKKOS_ARCH_A64FX
ValueType value{};
if (Impl::is_zero_byte(value)) {
uint64_t kpID = 0;
if (Kokkos::Profiling::profileLibraryLoaded()) {
@ -3539,9 +3537,7 @@ class ViewMapping<
typename SrcTraits::array_layout>::value ||
std::is_same<typename DstTraits::array_layout,
Kokkos::LayoutStride>::value ||
(DstTraits::dimension::rank == 0) ||
(DstTraits::dimension::rank == 1 &&
DstTraits::dimension::rank_dynamic == 1)
(DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1)
};

public:

@ -73,6 +73,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)

SET(COMPILE_ONLY_SOURCES
TestArray.cpp
TestCreateMirror.cpp
TestDetectionIdiom.cpp
TestInterOp.cpp
TestLegionInteroperability.cpp
@ -86,6 +87,7 @@ ENDIF()
KOKKOS_ADD_EXECUTABLE(
TestCompileOnly
SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES}
)

1
lib/kokkos/core/unit_test/TestCompileMain.cpp
Normal file
@ -0,0 +1 @@
int main() {}

126
lib/kokkos/core/unit_test/TestCreateMirror.cpp
Normal file
@ -0,0 +1,126 @@
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 3.0
|
||||
// Copyright (2020) National Technology & Engineering
|
||||
// Solutions of Sandia, LLC (NTESS).
|
||||
//
|
||||
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
template <typename TestView, typename MemorySpace>
|
||||
void check_memory_space(TestView, MemorySpace) {
|
||||
static_assert(
|
||||
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
|
||||
}
|
||||
|
||||
template <class View>
|
||||
auto host_mirror_test_space(View) {
|
||||
return std::conditional_t<
|
||||
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
|
||||
typename View::memory_space>::accessible,
|
||||
typename View::memory_space, Kokkos::HostSpace>{};
|
||||
}
|
||||
|
||||
template <typename View>
|
||||
void test_create_mirror_properties(const View& view) {
|
||||
using namespace Kokkos;
|
||||
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// create_mirror
|
||||
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror( view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
|
||||
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
|
||||
|
||||
// create_mirror_view
|
||||
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
|
||||
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
|
||||
|
||||
// create_mirror view_alloc
|
||||
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||
|
||||
// create_mirror_view view_alloc
|
||||
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||
|
||||
// create_mirror view_alloc + execution space
|
||||
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||
|
||||
// create_mirror_view view_alloc + execution space
|
||||
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
|
||||
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||
|
||||
// create_mirror_view_and_copy
|
||||
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
|
||||
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
|
||||
|
||||
// create_mirror_view_and_copy view_alloc
|
||||
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
|
||||
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||
|
||||
// create_mirror_view_and_copy view_alloc + execution space
|
||||
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
|
||||
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
void test() {
|
||||
Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view("device view",
|
||||
10);
|
||||
Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10);
|
||||
|
||||
test_create_mirror_properties(device_view);
|
||||
test_create_mirror_properties(host_view);
|
||||
}
|
||||
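TestCreateMirror.cpp above only checks which memory space each create_mirror / create_mirror_view overload allocates in. A short usage sketch of the overloads it exercises (view names and sizes are illustrative, not from the patch):

#include <Kokkos_Core.hpp>

void mirror_examples() {
  Kokkos::View<double*> a("a", 100);  // allocated in the default memory space

  // Host-accessible mirror, default initialized.
  auto h_a = Kokkos::create_mirror_view(a);

  // Host-accessible mirror, memory left uninitialized (cheaper for big views).
  auto h_b = Kokkos::create_mirror_view(
      Kokkos::view_alloc(Kokkos::WithoutInitializing), a);

  // Mirror requested in an explicit memory space; when that space matches the
  // source view's space, create_mirror_view simply returns the source view.
  using device_memory_space = Kokkos::DefaultExecutionSpace::memory_space;
  auto d_a = Kokkos::create_mirror_view(
      Kokkos::view_alloc(device_memory_space{}), a);

  Kokkos::deep_copy(h_a, a);  // device -> host
  Kokkos::deep_copy(a, h_a);  // host -> device
}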
@ -92,5 +92,3 @@ static_assert(std::is_same<difference_type<Woof>, int>::value,
|
||||
static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
|
||||
"Bark's difference_type should be ptrdiff_t!");
|
||||
} // namespace Example
|
||||
|
||||
int main() {}
|
||||
|
||||
@ -45,20 +45,23 @@
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <cstdio>
|
||||
|
||||
namespace Test {
|
||||
namespace {
|
||||
|
||||
template <class Device>
|
||||
template <class Device, class T, T ImbalanceSz>
|
||||
struct TestScan {
|
||||
using execution_space = Device;
|
||||
using value_type = int64_t;
|
||||
using value_type = T;
|
||||
|
||||
Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const int iwork, value_type& update,
|
||||
const bool final_pass) const {
|
||||
const value_type n = iwork + 1;
|
||||
const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0;
|
||||
const value_type n = iwork + 1;
|
||||
const value_type imbalance =
|
||||
((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz))
|
||||
? ImbalanceSz
|
||||
: value_type(0);
|
||||
|
||||
// Insert an artificial load imbalance
|
||||
|
||||
@ -133,12 +136,29 @@ struct TestScan {
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
TEST(TEST_CATEGORY, scan) {
|
||||
TestScan<TEST_EXECSPACE>::test_range(1, 1000);
|
||||
TestScan<TEST_EXECSPACE>(0);
|
||||
TestScan<TEST_EXECSPACE>(100000);
|
||||
TestScan<TEST_EXECSPACE>(10000000);
|
||||
TEST_EXECSPACE().fence();
|
||||
constexpr auto imbalance_size = 1000;
|
||||
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>::test_range(1, 1000);
|
||||
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(0);
|
||||
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(100000);
|
||||
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(10000000);
|
||||
}
|
||||
|
||||
TEST(TEST_CATEGORY, small_size_scan) {
|
||||
constexpr auto imbalance_size = 10; // Pick to not overflow...
|
||||
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(0);
|
||||
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(5);
|
||||
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(10);
|
||||
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(
|
||||
static_cast<std::size_t>(
|
||||
std::sqrt(std::numeric_limits<std::int8_t>::max())));
|
||||
constexpr auto short_imbalance_size = 100; // Pick to not overflow...
|
||||
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(0);
|
||||
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(5);
|
||||
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(100);
|
||||
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(
|
||||
static_cast<std::size_t>(
|
||||
std::sqrt(std::numeric_limits<std::int16_t>::max())));
|
||||
}
|
||||
} // namespace Test
|
||||
|
||||
@ -1616,6 +1616,73 @@ struct TestTeamPolicyHandleByValue {
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace {
|
||||
template <typename ExecutionSpace>
|
||||
struct TestRepeatedTeamReduce {
|
||||
static constexpr int ncol = 1500; // nothing special, just some work
|
||||
|
||||
KOKKOS_FUNCTION void operator()(
|
||||
const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
|
||||
const {
|
||||
// non-divisible by power of two to make triggering problems easier
|
||||
constexpr int nlev = 129;
|
||||
constexpr auto pi = Kokkos::Experimental::pi_v<double>;
|
||||
double b = 0.;
|
||||
for (int ri = 0; ri < 10; ++ri) {
|
||||
// The contributions here must be sufficiently complex, simply adding ones
|
||||
// wasn't enough to trigger the bug.
|
||||
const auto g1 = [&](const int k, double &acc) {
|
||||
acc += Kokkos::cos(pi * double(k) / nlev);
|
||||
};
|
||||
const auto g2 = [&](const int k, double &acc) {
|
||||
acc += Kokkos::sin(pi * double(k) / nlev);
|
||||
};
|
||||
double a1, a2;
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
|
||||
b += a1;
|
||||
b += a2;
|
||||
}
|
||||
const auto h = [&]() {
|
||||
const auto col = team.league_rank();
|
||||
v(col) = b + col;
|
||||
};
|
||||
Kokkos::single(Kokkos::PerTeam(team), h);
|
||||
}
|
||||
|
||||
KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
|
||||
if (v(i) != v(0) + i) {
|
||||
++bad;
|
||||
KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i);
|
||||
}
|
||||
}
|
||||
|
||||
TestRepeatedTeamReduce() : v("v", ncol) { test(); }
|
||||
|
||||
void test() {
|
||||
int team_size_recommended =
|
||||
Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended(
|
||||
*this, Kokkos::ParallelForTag());
|
||||
// Choose a non-recommened (non-power of two for GPUs) team size
|
||||
int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1;
|
||||
|
||||
// The failure was non-deterministic so run the test a bunch of times
|
||||
for (int it = 0; it < 100; ++it) {
|
||||
Kokkos::parallel_for(
|
||||
Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this);
|
||||
|
||||
int bad = 0;
|
||||
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol),
|
||||
*this, bad);
|
||||
ASSERT_EQ(bad, 0) << " Failing in iteration " << it;
|
||||
}
|
||||
}
|
||||
|
||||
Kokkos::View<double *, ExecutionSpace> v;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
} // namespace Test
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
@ -134,5 +134,15 @@ TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TEST_CATEGORY, repeated_team_reduce) {
|
||||
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
||||
if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
|
||||
GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not "
|
||||
"properly implemented";
|
||||
#endif
|
||||
|
||||
TestRepeatedTeamReduce<TEST_EXECSPACE>();
|
||||
}
|
||||
|
||||
} // namespace Test
|
||||
#endif
|
||||
|
||||
@ -92,8 +92,18 @@ TEST(TEST_CATEGORY, view_is_assignable) {
|
||||
View<double*, left, d_exec>>::test(false, false, 10);
|
||||
|
||||
// Layout assignment
|
||||
Impl::TestAssignability<View<int, left, d_exec>,
|
||||
View<int, right, d_exec>>::test(true, true);
|
||||
Impl::TestAssignability<View<int*, left, d_exec>,
|
||||
View<int*, right, d_exec>>::test(true, true, 10);
|
||||
Impl::TestAssignability<View<int[5], left, d_exec>,
|
||||
View<int*, right, d_exec>>::test(false, false, 10);
|
||||
Impl::TestAssignability<View<int[10], left, d_exec>,
|
||||
View<int*, right, d_exec>>::test(false, true, 10);
|
||||
Impl::TestAssignability<View<int*, left, d_exec>,
|
||||
View<int[5], right, d_exec>>::test(true, true);
|
||||
Impl::TestAssignability<View<int[5], left, d_exec>,
|
||||
View<int[10], right, d_exec>>::test(false, false);
|
||||
|
||||
// This could be made possible (due to the degenerate nature of the views) but
|
||||
// we do not allow this yet
|
||||
|
||||
199
lib/kokkos/kokkos_5538.diff
Normal file
@ -0,0 +1,199 @@
|
||||
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
|
||||
index 22af411f32..530510a0d1 100644
|
||||
--- a/lib/kokkos/Makefile.kokkos
|
||||
+++ b/lib/kokkos/Makefile.kokkos
|
||||
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Threads"
|
||||
# Options:
|
||||
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
|
||||
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
|
||||
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
|
||||
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
|
||||
# IBM: BGQ,Power7,Power8,Power9
|
||||
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
|
||||
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
|
||||
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
|
||||
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
|
||||
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
|
||||
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
|
||||
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
|
||||
|
||||
#SEK: This seems like a bug to me
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
|
||||
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
|
||||
endif
|
||||
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
|
||||
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
|
||||
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
|
||||
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
|
||||
+ endif
|
||||
|
||||
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
|
||||
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
|
||||
index 88ddc48378..b83ced9243 100644
|
||||
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
|
||||
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
|
||||
@@ -102,6 +102,7 @@
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE80
|
||||
#cmakedefine KOKKOS_ARCH_AMPERE86
|
||||
+#cmakedefine KOKKOS_ARCH_HOPPER90
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
|
||||
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
|
||||
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
|
||||
index f56cef1651..2585a6a64c 100644
|
||||
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
|
||||
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
|
||||
@@ -74,6 +74,7 @@ int main() {
|
||||
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
|
||||
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
|
||||
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
|
||||
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
|
||||
default:
|
||||
std::cout << "Compute capability " << compute_capability
|
||||
<< " is not supported" << std::endl;
|
||||
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
|
||||
index ef16aad047..c1d76cceeb 100644
|
||||
--- a/lib/kokkos/cmake/kokkos_arch.cmake
|
||||
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
|
||||
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
|
||||
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
|
||||
|
||||
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
|
||||
SET(KOKKOS_SHOW_HIP_ARCHS ON)
|
||||
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
|
||||
CHECK_CUDA_ARCH(TURING75 sm_75)
|
||||
CHECK_CUDA_ARCH(AMPERE80 sm_80)
|
||||
CHECK_CUDA_ARCH(AMPERE86 sm_86)
|
||||
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
|
||||
|
||||
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
|
||||
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
|
||||
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
|
||||
SET(KOKKOS_ARCH_AMPERE ON)
|
||||
ENDIF()
|
||||
|
||||
+IF (KOKKOS_ARCH_HOPPER90)
|
||||
+ SET(KOKKOS_ARCH_HOPPER ON)
|
||||
+ENDIF()
|
||||
+
|
||||
#Regardless of version, make sure we define the general architecture name
|
||||
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
|
||||
SET(KOKKOS_ARCH_VEGA ON)
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index 56f9117844..fcd4773dbc 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
|
||||
case 61: return 96;
|
||||
case 70:
|
||||
case 80:
|
||||
- case 86: return 8;
|
||||
+ case 86:
|
||||
+ case 90: return 8;
|
||||
case 75: return 32;
|
||||
default:
|
||||
Kokkos::Impl::throw_runtime_exception(
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
|
||||
index 40a263561f..8c40ebd60d 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
|
||||
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
|
||||
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
|
||||
|
||||
#if CUDA_VERSION >= 11010 && \
|
||||
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
|
||||
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
|
||||
index f9451ecfe6..2ce1efb98c 100644
|
||||
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
|
||||
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
|
||||
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
|
||||
|
||||
struct OpenACC_Traits {
|
||||
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_AMPERE)
|
||||
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
|
||||
static constexpr acc_device_t dev_type = acc_device_nvidia;
|
||||
static constexpr bool may_fallback_to_host = false;
|
||||
#else
|
||||
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
|
||||
index a9bc085912..27ee1d4232 100644
|
||||
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
|
||||
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
|
||||
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
|
||||
|
||||
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
|
||||
// from Pascal and upwards.
|
||||
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
|
||||
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
|
||||
+ defined(KOKKOS_ARCH_HOPPER)
|
||||
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
|
||||
omp_set_num_teams(512);
|
||||
#endif
|
||||
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
|
||||
index 840db4327c..7e5addbc5b 100644
|
||||
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
|
||||
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
|
||||
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
|
||||
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
|
||||
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
|
||||
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
|
||||
- !defined(KOKKOS_ARCH_AMPERE)
|
||||
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
|
||||
if (!settings.has_device_id() && gpu_devices.empty()) {
|
||||
Impl::SYCLInternal::singleton().initialize(sycl::device());
|
||||
return;
|
||||
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
|
||||
index 5ac7d8af30..ba101f699e 100644
|
||||
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
|
||||
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
|
||||
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
|
||||
return std::min({
|
||||
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
|
||||
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
|
||||
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
|
||||
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
|
||||
+ defined(KOKKOS_ARCH_HOPPER)
|
||||
256,
|
||||
#endif
|
||||
max_threads_for_memory
|
||||
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
|
||||
return std::min<int>({
|
||||
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
|
||||
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
|
||||
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
|
||||
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
|
||||
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
|
||||
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
|
||||
+ defined(KOKKOS_ARCH_HOPPER)
|
||||
256,
|
||||
#endif
|
||||
max_threads_for_memory
|
||||
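The kokkos_5538.diff listing above ends here; its core is threading the new Hopper (compute capability 9.0, sm_90) architecture through the Makefile, CMake, and backend guards. For orientation, a small sketch of the compute-capability-to-option mapping that the bundled cuda_compute_capability.cc change extends (only the cases touched above are taken from the patch; the rest is illustrative):

#include <cstdio>

const char* kokkos_arch_option(int compute_capability) {
  switch (compute_capability) {
    case 75: return "Kokkos_ARCH_TURING75";
    case 80: return "Kokkos_ARCH_AMPERE80";
    case 86: return "Kokkos_ARCH_AMPERE86";
    case 90: return "Kokkos_ARCH_HOPPER90";  // added by this patch
    default: return nullptr;                 // unsupported here
  }
}

int main() {
  const int cc = 90;  // e.g. value reported by cudaGetDeviceProperties
  if (const char* opt = kokkos_arch_option(cc))
    std::printf("Set -D%s=ON .\n", opt);
  else
    std::printf("Compute capability %d is not supported\n", cc);
}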
523
lib/kokkos/kokkos_5706.diff
Normal file
@ -0,0 +1,523 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index fcd4773dbc..30b6958a67 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
|
||||
// NOTE these number can be obtained several ways:
|
||||
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
|
||||
// "Compute Capability" first and check what is the smallest "Shared Memory
|
||||
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
|
||||
return 0;
|
||||
}() * 1024;
|
||||
}
|
||||
+
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
index 5811498e01..e22eb3b842 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
|
||||
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
|
||||
}
|
||||
#endif
|
||||
|
||||
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
|
||||
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
|
||||
-#else
|
||||
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
|
||||
-#endif
|
||||
-
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
index b7a80ad84f..5c4c3a7d39 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
@@ -93,10 +93,6 @@ namespace Impl {
|
||||
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
|
||||
// function qualifier which could be used to improve performance.
|
||||
//----------------------------------------------------------------------------
|
||||
-// Maximize L1 cache and minimize shared memory:
|
||||
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
|
||||
-// For 2.0 capability: 48 KB L1 and 16 KB shared
|
||||
-//----------------------------------------------------------------------------
|
||||
|
||||
template <class DriverType>
|
||||
__global__ static void cuda_parallel_launch_constant_memory() {
|
||||
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
|
||||
}
|
||||
}
|
||||
|
||||
-// This function needs to be template on DriverType and LaunchBounds
|
||||
+// These functions needs to be template on DriverType and LaunchBounds
|
||||
// so that the static bool is unique for each type combo
|
||||
// KernelFuncPtr does not necessarily contain that type information.
|
||||
+
|
||||
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
|
||||
-inline void configure_shmem_preference(KernelFuncPtr const& func,
|
||||
- bool prefer_shmem) {
|
||||
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
|
||||
+ const KernelFuncPtr& func) {
|
||||
+ // Only call cudaFuncGetAttributes once for each unique kernel
|
||||
+ // by leveraging static variable initialization rules
|
||||
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
|
||||
+ cudaFuncAttributes attr;
|
||||
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
|
||||
+ return attr;
|
||||
+ };
|
||||
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
|
||||
+ return func_attr;
|
||||
+}
|
||||
+
|
||||
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
|
||||
+inline void configure_shmem_preference(const KernelFuncPtr& func,
|
||||
+ const cudaDeviceProp& device_props,
|
||||
+ const size_t block_size, int& shmem,
|
||||
+ const size_t occupancy) {
|
||||
#ifndef KOKKOS_ARCH_KEPLER
|
||||
- // On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
+
|
||||
+ const auto& func_attr =
|
||||
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
|
||||
+
|
||||
+ // Compute limits for number of blocks due to registers/SM
|
||||
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
|
||||
+ const size_t regs_per_thread = func_attr.numRegs;
|
||||
+ // The granularity of register allocation is chunks of 256 registers per warp
|
||||
+ // -> 8 registers per thread
|
||||
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||
+ const size_t max_blocks_regs =
|
||||
+ regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
+
|
||||
+ // Compute how many threads per sm we actually want
|
||||
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
|
||||
+ // only allocate multiples of warp size
|
||||
+ const size_t num_threads_desired =
|
||||
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
|
||||
+ // Get close to the desired occupancy,
|
||||
+ // don't undershoot by much but also don't allocate a whole new block just
|
||||
+ // because one is a few threads over otherwise.
|
||||
+ size_t num_blocks_desired =
|
||||
+ (num_threads_desired + block_size * 0.8) / block_size;
|
||||
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
|
||||
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
|
||||
+
|
||||
+ // Calculate how much shared memory we need per block
|
||||
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
|
||||
+
|
||||
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
|
||||
+ // If we want to lower occupancy we have to make sure we request at least that
|
||||
+ // much in aggregate over all blocks, so that shared memory actually becomes a
|
||||
+ // limiting factor for occupancy
|
||||
+ constexpr size_t min_shmem_size_per_sm = 8192;
|
||||
+ if ((occupancy < 100) &&
|
||||
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
|
||||
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
|
||||
+ // Need to set the caller's shmem variable so that the
|
||||
+ // kernel launch uses the correct dynamic shared memory request
|
||||
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
|
||||
+ }
|
||||
+
|
||||
+ // Compute the carveout fraction we need based on occupancy
|
||||
+ // Use multiples of 8kB
|
||||
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
|
||||
+ size_t carveout = shmem_per_block == 0
|
||||
+ ? 0
|
||||
+ : 100 *
|
||||
+ (((num_blocks_desired * shmem_per_block +
|
||||
+ min_shmem_size_per_sm - 1) /
|
||||
+ min_shmem_size_per_sm) *
|
||||
+ min_shmem_size_per_sm) /
|
||||
+ max_shmem_per_sm;
|
||||
+ if (carveout > 100) carveout = 100;
|
||||
+
|
||||
+ // Set the carveout, but only call it once per kernel or when it changes
|
||||
auto set_cache_config = [&] {
|
||||
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
|
||||
- func,
|
||||
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
|
||||
- return prefer_shmem;
|
||||
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
|
||||
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
|
||||
+ return carveout;
|
||||
};
|
||||
- static bool cache_config_preference_cached = set_cache_config();
|
||||
- if (cache_config_preference_cached != prefer_shmem) {
|
||||
+ // Store the value in a static variable so we only reset if needed
|
||||
+ static size_t cache_config_preference_cached = set_cache_config();
|
||||
+ if (cache_config_preference_cached != carveout) {
|
||||
cache_config_preference_cached = set_cache_config();
|
||||
}
|
||||
#else
|
||||
// Use the parameters so we don't get a warning
|
||||
(void)func;
|
||||
- (void)prefer_shmem;
|
||||
+ (void)device_props;
|
||||
+ (void)block_size;
|
||||
+ (void)occupancy;
|
||||
#endif
|
||||
}
|
||||
|
||||
-template <class Policy>
|
||||
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
|
||||
-modify_launch_configuration_if_desired_occupancy_is_specified(
|
||||
- Policy const& policy, cudaDeviceProp const& properties,
|
||||
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
|
||||
- bool& prefer_shmem) {
|
||||
- int const block_size = block.x * block.y * block.z;
|
||||
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
|
||||
-
|
||||
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
|
||||
- size_t const static_shmem = attributes.sharedSizeBytes;
|
||||
-
|
||||
- // round to nearest integer and avoid division by zero
|
||||
- int active_blocks = std::max(
|
||||
- 1, static_cast<int>(std::round(
|
||||
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
|
||||
- block_size * desired_occupancy / 100)));
|
||||
- int const dynamic_shmem =
|
||||
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
|
||||
-
|
||||
- if (dynamic_shmem > shmem) {
|
||||
- shmem = dynamic_shmem;
|
||||
- prefer_shmem = false;
|
||||
- }
|
||||
-}
|
||||
-
|
||||
-template <class Policy>
|
||||
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
|
||||
-modify_launch_configuration_if_desired_occupancy_is_specified(
|
||||
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
|
||||
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
|
||||
-
|
||||
// </editor-fold> end Some helper functions for launch code readability }}}1
|
||||
//==============================================================================
|
||||
|
||||
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
|
||||
inline static void create_parallel_launch_graph_node(
|
||||
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
|
||||
- CudaInternal const* cuda_instance, bool prefer_shmem) {
|
||||
+ CudaInternal const* cuda_instance) {
|
||||
//----------------------------------------
|
||||
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
|
||||
KOKKOS_EXPECTS(bool(graph));
|
||||
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
- base_t::get_kernel_func(), prefer_shmem);
|
||||
+ if (DriverType::Policy::
|
||||
+ experimental_contains_desired_occupancy) {
|
||||
+ int desired_occupancy =
|
||||
+ driver.get_policy().impl_get_desired_occupancy().value();
|
||||
+ size_t block_size = block.x * block.y * block.z;
|
||||
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
+ shmem, desired_occupancy);
|
||||
+ }
|
||||
|
||||
void const* args[] = {&driver};
|
||||
|
||||
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
|
||||
inline static void create_parallel_launch_graph_node(
|
||||
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
|
||||
- CudaInternal const* cuda_instance, bool prefer_shmem) {
|
||||
+ CudaInternal const* cuda_instance) {
|
||||
//----------------------------------------
|
||||
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
|
||||
KOKKOS_EXPECTS(bool(graph));
|
||||
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
- base_t::get_kernel_func(), prefer_shmem);
|
||||
+ if constexpr (DriverType::Policy::
|
||||
+ experimental_contains_desired_occupancy) {
|
||||
+ int desired_occupancy =
|
||||
+ driver.get_policy().impl_get_desired_occupancy().value();
|
||||
+ size_t block_size = block.x * block.y * block.z;
|
||||
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
+ shmem, desired_occupancy);
|
||||
+ }
|
||||
|
||||
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
|
||||
|
||||
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
|
||||
inline static void create_parallel_launch_graph_node(
|
||||
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
|
||||
- CudaInternal const* cuda_instance, bool prefer_shmem) {
|
||||
+ CudaInternal const* cuda_instance) {
|
||||
// Just use global memory; coordinating through events to share constant
|
||||
// memory with the non-graph interface is not really reasonable since
|
||||
// events don't work with Graphs directly, and this would anyway require
|
||||
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
DriverType, LaunchBounds,
|
||||
Experimental::CudaLaunchMechanism::GlobalMemory>;
|
||||
global_launch_impl_t::create_parallel_launch_graph_node(
|
||||
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
|
||||
+ driver, grid, block, shmem, cuda_instance);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
|
||||
|
||||
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
|
||||
const dim3& block, int shmem,
|
||||
- const CudaInternal* cuda_instance,
|
||||
- bool prefer_shmem) {
|
||||
+ const CudaInternal* cuda_instance) {
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
// Prevent multiple threads to simultaneously set the cache configuration
|
||||
// preference and launch the same kernel
|
||||
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
|
||||
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
|
||||
- // If a desired occupancy is specified, we compute how much shared memory
|
||||
- // to ask for to achieve that occupancy, assuming that the cache
|
||||
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
|
||||
- // shared memory computed is actually smaller than `shmem` we overwrite
|
||||
- // `shmem` and set `prefer_shmem` to `false`.
|
||||
- modify_launch_configuration_if_desired_occupancy_is_specified(
|
||||
- driver.get_policy(), cuda_instance->m_deviceProp,
|
||||
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
|
||||
-
|
||||
- Impl::configure_shmem_preference<
|
||||
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
- base_t::get_kernel_func(), prefer_shmem);
|
||||
+ if (DriverType::Policy::
|
||||
+ experimental_contains_desired_occupancy) {
|
||||
+ int desired_occupancy =
|
||||
+ driver.get_policy().impl_get_desired_occupancy().value();
|
||||
+ size_t block_size = block.x * block.y * block.z;
|
||||
+ Impl::configure_shmem_preference<
|
||||
+ DriverType,
|
||||
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
+ shmem, desired_occupancy);
|
||||
+ }
|
||||
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
|
||||
}
|
||||
|
||||
static cudaFuncAttributes get_cuda_func_attributes() {
|
||||
- // Race condition inside of cudaFuncGetAttributes if the same address is
|
||||
- // given requires using a local variable as input instead of a static Rely
|
||||
- // on static variable initialization to make sure only one thread executes
|
||||
- // the code and the result is visible.
|
||||
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
|
||||
- cudaFuncAttributes attr_tmp;
|
||||
- KOKKOS_IMPL_CUDA_SAFE_CALL(
|
||||
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
|
||||
- return attr_tmp;
|
||||
- };
|
||||
- static cudaFuncAttributes attr = wrap_get_attributes();
|
||||
- return attr;
|
||||
+ return get_cuda_kernel_func_attributes<
|
||||
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
+ base_t::get_kernel_func());
|
||||
}
|
||||
};
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
index e586bb4cc6..0e348c092a 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
maxblocks[1]),
|
||||
1);
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 3) {
|
||||
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
|
||||
KOKKOS_ASSERT(block.x > 0);
|
||||
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 4) {
|
||||
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
|
||||
// threadIdx.z
|
||||
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 5) {
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
|
||||
// threadIdx.z
|
||||
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else if (RP::rank == 6) {
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
|
||||
// threadIdx.z
|
||||
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
|
||||
maxblocks[2]));
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
|
||||
} else {
|
||||
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
|
||||
}
|
||||
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
||||
|
||||
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
if (!m_result_ptr_device_accessible) {
|
||||
if (m_result_ptr) {
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
|
||||
index ac160f8fe2..d1031751c2 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
|
||||
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
#endif
|
||||
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
|
||||
}
|
||||
|
||||
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
|
||||
|
||||
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
if (!m_result_ptr_device_accessible) {
|
||||
if (m_result_ptr) {
|
||||
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
m_final = false;
|
||||
CudaParallelLaunch<ParallelScan, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
}
|
||||
#endif
|
||||
m_final = true;
|
||||
CudaParallelLaunch<ParallelScan, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
|
||||
m_final = false;
|
||||
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
}
|
||||
#endif
|
||||
m_final = true;
|
||||
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
|
||||
*this, grid, block, shmem,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- false); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
const int size = Analysis::value_size(m_functor);
|
||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
|
||||
index cdd16085b3..ea9430b812 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
|
||||
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
|
||||
CudaParallelLaunch<ParallelFor, LaunchBounds>(
|
||||
*this, grid, block, shmem_size_total,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- true); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
}
|
||||
|
||||
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
|
||||
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
|
||||
*this, grid, block, shmem_size_total,
|
||||
- m_policy.space().impl_internal_space_instance(),
|
||||
- true); // copy to device and execute
|
||||
+ m_policy.space()
|
||||
+ .impl_internal_space_instance()); // copy to device and execute
|
||||
|
||||
if (!m_result_ptr_device_accessible) {
|
||||
m_policy.space().fence(
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
|
||||
index 34d4bef9fd..178012431c 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
|
||||
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
|
||||
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
|
||||
// function qualifier which could be used to improve performance.
|
||||
//----------------------------------------------------------------------------
|
||||
-// Maximize shared memory and minimize L1 cache:
|
||||
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
|
||||
-// For 2.0 capability: 48 KB shared and 16 KB L1
|
||||
-//----------------------------------------------------------------------------
|
||||
-//----------------------------------------------------------------------------
|
||||
/*
|
||||
* Algorithmic constraints:
|
||||
* (a) blockDim.y <= 1024
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
|
||||
index fb3a6b138f..a12378a891 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
|
||||
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
|
||||
const int shared = 0;
|
||||
|
||||
Kokkos::Impl::CudaParallelLaunch<Self>(
|
||||
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
|
||||
- false);
|
||||
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
|
||||
}
|
||||
|
||||
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
|
||||
46 lib/kokkos/kokkos_5731.diff Normal file
@ -0,0 +1,46 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index 30b6958a67..b94f053272 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
-// NOTE these number can be obtained several ways:
|
||||
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
|
||||
-// "Compute Capability" first and check what is the smallest "Shared Memory
|
||||
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
|
||||
-// bytes is then to be found below in the summary.
|
||||
-// * Another option would be to look for the information in the "Tuning
|
||||
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
|
||||
-// the "Shared Memory" section (more tedious)
|
||||
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
|
||||
- int const compute_capability = properties.major * 10 + properties.minor;
|
||||
- return [compute_capability]() {
|
||||
- switch (compute_capability) {
|
||||
- case 30:
|
||||
- case 32:
|
||||
- case 35: return 16;
|
||||
- case 37: return 80;
|
||||
- case 50:
|
||||
- case 53:
|
||||
- case 60:
|
||||
- case 62: return 64;
|
||||
- case 52:
|
||||
- case 61: return 96;
|
||||
- case 70:
|
||||
- case 80:
|
||||
- case 86:
|
||||
- case 90: return 8;
|
||||
- case 75: return 32;
|
||||
- default:
|
||||
- Kokkos::Impl::throw_runtime_exception(
|
||||
- "Unknown device in cuda block size deduction");
|
||||
- }
|
||||
- return 0;
|
||||
- }() * 1024;
|
||||
-}
|
||||
-
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
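The hunk above (kokkos_5731.diff) drops the hard-coded per-architecture shared-memory table; the surrounding launch code relies on the queried device properties instead (see the sharedMemPerMultiprocessor uses further down in this commit). A minimal sketch of querying the same quantity at run time with the standard CUDA runtime API, assuming device 0 is present:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      cudaDeviceProp prop{};
      cudaGetDeviceProperties(&prop, 0);             // query device 0
      std::printf("shared memory per SM: %zu bytes\n",
                  prop.sharedMemPerMultiprocessor);  // replaces the lookup table
    }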
|
||||
|
||||
204 lib/kokkos/kokkos_5739.diff Normal file
@ -0,0 +1,204 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
index b94f053272..252c13c524 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
|
||||
@@ -53,17 +53,69 @@
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
+inline int cuda_warp_per_sm_allocation_granularity(
|
||||
+ cudaDeviceProp const& properties) {
|
||||
+ // Allocation granularity of warps in each sm
|
||||
+ switch (properties.major) {
|
||||
+ case 3:
|
||||
+ case 5:
|
||||
+ case 7:
|
||||
+ case 8:
|
||||
+ case 9: return 4;
|
||||
+ case 6: return (properties.minor == 0 ? 2 : 4);
|
||||
+ default:
|
||||
+ throw_runtime_exception(
|
||||
+ "Unknown device in cuda warp per sm allocation granularity");
|
||||
+ return 0;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+inline int cuda_max_warps_per_sm_registers(
|
||||
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
|
||||
+ // Maximum number of warps per sm as a function of register counts,
|
||||
+ // subject to the constraint that warps are allocated with a fixed granularity
|
||||
+ int const max_regs_per_block = properties.regsPerBlock;
|
||||
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
|
||||
+ int const warp_granularity =
|
||||
+ cuda_warp_per_sm_allocation_granularity(properties);
|
||||
+ // The granularity of register allocation is chunks of 256 registers per warp,
|
||||
+ // which implies a need to over-allocate, so we round up
|
||||
+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
|
||||
+
|
||||
+ // The maximum number of warps per SM is constrained from above by register
|
||||
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
|
||||
+ // finite granularity, we need to round down.
|
||||
+ int const max_warps_per_sm =
|
||||
+ warp_granularity *
|
||||
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
|
||||
+
|
||||
+ return max_warps_per_sm;
|
||||
+}
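The two helpers added above feed the clamp applied further down in cuda_max_active_blocks_per_sm: the register-derived block count is reduced until the warps it implies fit the granularity-constrained warp budget. A small self-contained illustration of that clamp (numbers chosen for the example, not taken from any particular GPU):

    #include <cstdio>

    // Reduce the register-based block bound until the implied warp count
    // respects the per-SM warp budget (mirrors the while loop in the patch).
    int clamp_blocks_to_warp_budget(int max_blocks_regs, int block_size,
                                    int warp_size, int max_warps_per_sm) {
      while ((max_blocks_regs * block_size / warp_size) > max_warps_per_sm)
        --max_blocks_regs;
      return max_blocks_regs;
    }

    int main() {
      // 16 candidate blocks of 256 threads imply 128 warps; a 64-warp budget
      // clamps this to 8 blocks.
      std::printf("%d\n", clamp_blocks_to_warp_budget(16, 256, 32, 64));  // prints 8
    }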
|
||||
+
|
||||
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
|
||||
cudaFuncAttributes const& attributes,
|
||||
int block_size, size_t dynamic_shmem) {
|
||||
- // Limits due do registers/SM
|
||||
+ // Limits due to registers/SM
|
||||
int const regs_per_sm = properties.regsPerMultiprocessor;
|
||||
int const regs_per_thread = attributes.numRegs;
|
||||
// The granularity of register allocation is chunks of 256 registers per warp
|
||||
// -> 8 registers per thread
|
||||
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||
- int const max_blocks_regs =
|
||||
- regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
+
|
||||
+ // Compute the maximum number of warps as a function of the number of
|
||||
+ // registers
|
||||
+ int const max_warps_per_sm_registers =
|
||||
+ cuda_max_warps_per_sm_registers(properties, attributes);
|
||||
+
|
||||
+ // Constrain the number of blocks to respect the maximum number of warps per
|
||||
+ // SM On face value this should be an equality, but due to the warp
|
||||
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
|
||||
+ // left-hand-side of this comparison can overshoot what the hardware allows
|
||||
+ // based on register counts alone
|
||||
+ while ((max_blocks_regs * block_size / properties.warpSize) >
|
||||
+ max_warps_per_sm_registers)
|
||||
+ max_blocks_regs--;
|
||||
|
||||
// Limits due to shared memory/SM
|
||||
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
|
||||
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
|
||||
LaunchBounds{});
|
||||
}
|
||||
|
||||
+template <class LaunchBounds>
|
||||
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
|
||||
+ LaunchBounds) {
|
||||
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
|
||||
+
|
||||
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
|
||||
+ // memory
|
||||
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
|
||||
+
|
||||
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
|
||||
+ LaunchBounds{});
|
||||
+}
|
||||
+
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
index 5c4c3a7d39..170183ca0a 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
|
||||
// The granularity of register allocation is chunks of 256 registers per warp
|
||||
// -> 8 registers per thread
|
||||
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||
- const size_t max_blocks_regs =
|
||||
+ size_t max_blocks_regs =
|
||||
regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||
|
||||
+ // Compute the maximum number of warps as a function of the number of
|
||||
+ // registers
|
||||
+ const size_t max_warps_per_sm_registers =
|
||||
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
|
||||
+
|
||||
+ // Constrain the number of blocks to respect the maximum number of warps per
|
||||
+ // SM On face value this should be an equality, but due to the warp
|
||||
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
|
||||
+ // left-hand-side of this comparison can overshoot what the hardware allows
|
||||
+ // based on register counts alone
|
||||
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
|
||||
+ max_warps_per_sm_registers)
|
||||
+ max_blocks_regs--;
|
||||
+
|
||||
// Compute how many threads per sm we actually want
|
||||
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
|
||||
// only allocate multiples of warp size
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
index 0e348c092a..7e4f62f12e 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
|
||||
@@ -67,6 +67,34 @@
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
+template <typename ParallelType, typename Policy, typename LaunchBounds>
|
||||
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
|
||||
+ cudaFuncAttributes attr =
|
||||
+ CudaParallelLaunch<ParallelType,
|
||||
+ LaunchBounds>::get_cuda_func_attributes();
|
||||
+ auto const& prop = pol.space().cuda_device_prop();
|
||||
+
|
||||
+ // Limits due to registers/SM, MDRange doesn't have
|
||||
+ // shared memory constraints
|
||||
+ int const optimal_block_size =
|
||||
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
|
||||
+
|
||||
+ // Compute how many blocks of this size we can launch, based on warp
|
||||
+ // constraints
|
||||
+ int const max_warps_per_sm_registers =
|
||||
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
|
||||
+ int const max_num_threads_from_warps =
|
||||
+ max_warps_per_sm_registers * prop.warpSize;
|
||||
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
|
||||
+
|
||||
+ // Compute the total number of threads
|
||||
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
|
||||
+
|
||||
+ return std::min(
|
||||
+ max_threads_per_sm,
|
||||
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
|
||||
+}
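Reading the new helper with round numbers may help; the figures below are purely illustrative (real values come from cudaFuncAttributes and cudaDeviceProp at run time, and the 1024 cap is only an assumed value of CudaTraits::MaxHierarchicalParallelism for the example):

    #include <algorithm>
    #include <cstdio>

    int main() {
      int const optimal_block_size           = 256;   // from block-size deduction
      int const max_warps_per_sm_registers   = 64;    // from register constraints
      int const warp_size                    = 32;
      int const max_hierarchical_parallelism = 1024;  // assumed cap for the example

      int const max_num_threads_from_warps = max_warps_per_sm_registers * warp_size; // 2048
      int const max_num_blocks     = max_num_threads_from_warps / optimal_block_size; // 8
      int const max_threads_per_sm = optimal_block_size * max_num_blocks;             // 2048
      std::printf("%d\n", std::min(max_threads_per_sm, max_hierarchical_parallelism)); // 1024
    }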
|
||||
+
|
||||
template <class FunctorType, class... Traits>
|
||||
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
public:
|
||||
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
|
||||
public:
|
||||
template <typename Policy, typename Functor>
|
||||
static int max_tile_size_product(const Policy& pol, const Functor&) {
|
||||
- cudaFuncAttributes attr =
|
||||
- CudaParallelLaunch<ParallelFor,
|
||||
- LaunchBounds>::get_cuda_func_attributes();
|
||||
- auto const& prop = pol.space().cuda_device_prop();
|
||||
- // Limits due to registers/SM, MDRange doesn't have
|
||||
- // shared memory constraints
|
||||
- int const regs_per_sm = prop.regsPerMultiprocessor;
|
||||
- int const regs_per_thread = attr.numRegs;
|
||||
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
|
||||
- return std::min(
|
||||
- max_threads_per_sm,
|
||||
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
|
||||
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
|
||||
}
|
||||
Policy const& get_policy() const { return m_rp; }
|
||||
inline __device__ void operator()() const {
|
||||
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
|
||||
public:
|
||||
template <typename Policy, typename Functor>
|
||||
static int max_tile_size_product(const Policy& pol, const Functor&) {
|
||||
- cudaFuncAttributes attr =
|
||||
- CudaParallelLaunch<ParallelReduce,
|
||||
- LaunchBounds>::get_cuda_func_attributes();
|
||||
- auto const& prop = pol.space().cuda_device_prop();
|
||||
- // Limits due do registers/SM
|
||||
- int const regs_per_sm = prop.regsPerMultiprocessor;
|
||||
- int const regs_per_thread = attr.numRegs;
|
||||
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
|
||||
- return std::min(
|
||||
- max_threads_per_sm,
|
||||
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
|
||||
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
|
||||
}
|
||||
Policy const& get_policy() const { return m_policy; }
|
||||
inline __device__ void exec_range(reference_type update) const {
|
||||
63 lib/kokkos/kokkos_fix_5706_apply_last.diff Normal file
@ -0,0 +1,63 @@
|
||||
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
index 170183ca0a..ba43e362bb 100644
|
||||
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
if (DriverType::Policy::
|
||||
experimental_contains_desired_occupancy) {
|
||||
+ /*
|
||||
int desired_occupancy =
|
||||
driver.get_policy().impl_get_desired_occupancy().value();
|
||||
size_t block_size = block.x * block.y * block.z;
|
||||
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
- shmem, desired_occupancy);
|
||||
+ shmem, desired_occupancy);*/
|
||||
+ Kokkos::Impl::throw_runtime_exception(
|
||||
+ std::string("Cuda graph node creation FAILED:"
|
||||
+ " occupancy requests are currently broken."));
|
||||
}
|
||||
|
||||
void const* args[] = {&driver};
|
||||
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
|
||||
|
||||
if (!Impl::is_empty_launch(grid, block)) {
|
||||
Impl::check_shmem_request(cuda_instance, shmem);
|
||||
- if constexpr (DriverType::Policy::
|
||||
+ if (DriverType::Policy::
|
||||
experimental_contains_desired_occupancy) {
|
||||
- int desired_occupancy =
|
||||
+ /*int desired_occupancy =
|
||||
driver.get_policy().impl_get_desired_occupancy().value();
|
||||
size_t block_size = block.x * block.y * block.z;
|
||||
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
|
||||
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
- shmem, desired_occupancy);
|
||||
+ shmem, desired_occupancy);*/
|
||||
+ Kokkos::Impl::throw_runtime_exception(
|
||||
+ std::string("Cuda graph node creation FAILED:"
|
||||
+ " occupancy requests are currently broken."));
|
||||
}
|
||||
|
||||
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
|
||||
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
|
||||
|
||||
if (DriverType::Policy::
|
||||
experimental_contains_desired_occupancy) {
|
||||
- int desired_occupancy =
|
||||
+ /*int desired_occupancy =
|
||||
driver.get_policy().impl_get_desired_occupancy().value();
|
||||
size_t block_size = block.x * block.y * block.z;
|
||||
Impl::configure_shmem_preference<
|
||||
DriverType,
|
||||
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
|
||||
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
|
||||
- shmem, desired_occupancy);
|
||||
+ shmem, desired_occupancy);*/
|
||||
+ Kokkos::Impl::throw_runtime_exception(
|
||||
+ std::string("Cuda graph node creation FAILED:"
|
||||
+ " occupancy requests are currently broken."));
|
||||
}
|
||||
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
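For context, the paths patched in kokkos_fix_5706_apply_last.diff are only reached when a policy carries an occupancy hint. A hedged sketch of the kind of user code that sets experimental_contains_desired_occupancy, using the Kokkos::Experimental::prefer / DesiredOccupancy API (exact behavior with this workaround applied is an assumption based on the exception text added above):

    #include <Kokkos_Core.hpp>

    void run_with_occupancy_hint(int n) {
      auto policy = Kokkos::Experimental::prefer(
          Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
          Kokkos::Experimental::DesiredOccupancy{33});  // request ~33% occupancy
      // With the patch above applied, launching through the modified code
      // paths with such a hinted policy would hit the added runtime_exception.
      Kokkos::parallel_for("hinted_kernel", policy,
                           KOKKOS_LAMBDA(int i) { (void)i; });
    }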
|
||||
@ -29,3 +29,4 @@ tag: 3.5.00 date: 11:19:2021 master: c28a8b03 release: 21b879e4
|
||||
tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff
|
||||
tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404
|
||||
tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb
|
||||
tag: 3.7.01 date: 12:01:2022 master: 61d7db55 release: d3bb8cfe
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
|
||||
LIB_REQUIRED_PACKAGES KokkosCore
|
||||
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
|
||||
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
|
||||
TEST_OPTIONAL_TPLS CUSPARSE
|
||||
)
|
||||
|
||||
3 lib/kokkos/tpls/.clang-format Normal file
@ -0,0 +1,3 @@
|
||||
#Official Tool: clang-format version 8.0.0
|
||||
DisableFormat: true
|
||||
SortIncludes: false
|
||||
@ -76,7 +76,7 @@ namespace Impl {
|
||||
/// instances in other translation units, we must update this CUDA global
|
||||
/// variable based on the Host global variable prior to running any kernels
|
||||
/// that will use it.
|
||||
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
|
||||
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
|
||||
__device__
|
||||
#ifdef __CUDACC_RDC__
|
||||
__constant__ extern
|
||||
@ -138,42 +138,33 @@ namespace {
|
||||
static int lock_array_copied = 0;
|
||||
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
|
||||
} // namespace
|
||||
|
||||
#ifdef __CUDACC_RDC__
|
||||
inline
|
||||
#else
|
||||
static
|
||||
#endif
|
||||
void
|
||||
copy_cuda_lock_arrays_to_device() {
|
||||
if (lock_array_copied == 0) {
|
||||
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
|
||||
&CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
|
||||
sizeof(int32_t*));
|
||||
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
|
||||
&CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
|
||||
sizeof(int32_t*));
|
||||
}
|
||||
lock_array_copied = 1;
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace desul
|
||||
/* It is critical that this code be a macro, so that it will
|
||||
capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE
|
||||
putting this in an inline function will NOT do the right thing! */
|
||||
#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
|
||||
{ \
|
||||
if (::desul::Impl::lock_array_copied == 0) { \
|
||||
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE, \
|
||||
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
|
||||
sizeof(int32_t*)); \
|
||||
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE, \
|
||||
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \
|
||||
sizeof(int32_t*)); \
|
||||
} \
|
||||
::desul::Impl::lock_array_copied = 1; \
|
||||
}
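The comment preceding the macro is the key point of this hunk: without relocatable device code each translation unit holds its own copy of the __constant__ lock-array pointers, so the cudaMemcpyToSymbol call must expand at the call site. A minimal sketch of the idea (hypothetical names, not the desul symbols; the explanation of inline-function behavior here is my reading of the comment above):

    #include <cuda_runtime.h>
    #include <cstdint>

    // Without -rdc=true, every .cu file that includes this header gets its
    // own instance of the __constant__ pointer below.
    __constant__ int32_t* EXAMPLE_LOCKS_DEVICE;
    static int32_t* EXAMPLE_LOCKS_DEVICE_h = nullptr;  // host-side copy, per TU

    // Expanding at the call site makes cudaMemcpyToSymbol bind to the copy
    // living in the calling translation unit; an ordinary inline function
    // would be merged by the linker into one definition that only ever
    // updates the copy in the TU it was emitted from.
    #define EXAMPLE_COPY_LOCKS_TO_DEVICE()                                \
      cudaMemcpyToSymbol(EXAMPLE_LOCKS_DEVICE, &EXAMPLE_LOCKS_DEVICE_h,   \
                         sizeof(int32_t*))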
|
||||
|
||||
#endif /* defined( __CUDACC__ ) */
|
||||
|
||||
#endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
|
||||
|
||||
namespace desul {
|
||||
|
||||
#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
|
||||
inline void ensure_cuda_lock_arrays_on_device() {}
|
||||
#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
|
||||
#else
|
||||
static inline void ensure_cuda_lock_arrays_on_device() {
|
||||
Impl::copy_cuda_lock_arrays_to_device();
|
||||
}
|
||||
#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
|
||||
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
|
||||
#endif
|
||||
|
||||
} // namespace desul
|
||||
|
||||
#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */
|
||||
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */
|
||||
|
||||
@ -70,7 +70,7 @@ void init_lock_arrays_cuda() {
|
||||
"init_lock_arrays_cuda: cudaMalloc host locks");
|
||||
|
||||
auto error_sync1 = cudaDeviceSynchronize();
|
||||
copy_cuda_lock_arrays_to_device();
|
||||
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
|
||||
check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
|
||||
init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
|
||||
auto error_sync2 = cudaDeviceSynchronize();
|
||||
@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() {
|
||||
CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
|
||||
CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
|
||||
#ifdef __CUDACC_RDC__
|
||||
copy_cuda_lock_arrays_to_device();
|
||||
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -190,6 +190,20 @@ namespace LAMMPS_NS {
|
||||
Tp_BIAS,Tp_RMASS,Tp_ZERO>(i);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void init(value_type &update) {
|
||||
update.fx = 0.0;
|
||||
update.fy = 0.0;
|
||||
update.fz = 0.0;
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void join(value_type &update,
|
||||
const value_type &source) {
|
||||
update.fx += source.fx;
|
||||
update.fy += source.fy;
|
||||
update.fz += source.fz;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void init(volatile value_type &update) {
|
||||
update.fx = 0.0;
|
||||
@ -233,6 +247,15 @@ namespace LAMMPS_NS {
|
||||
energy += c.compute_energy_item(i);
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void init(value_type &update) {
|
||||
update = 0.0;
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void join(value_type &update,
|
||||
const value_type &source) {
|
||||
update += source;
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void init(volatile value_type &update) {
|
||||
update = 0.0;
|
||||
}
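These hunks add non-volatile init/join overloads next to the existing volatile ones: newer Kokkos versions call the non-volatile form, while the volatile form is kept so older versions still compile. A hedged, stripped-down sketch of the same dual-overload pattern for a scalar reduction (hypothetical functor, mirroring the static-member style used in the LAMMPS code above):

    #include <Kokkos_Core.hpp>

    struct EnergySumFunctor {
      using value_type = double;

      KOKKOS_INLINE_FUNCTION
      void operator()(const int i, value_type& sum) const { sum += 1.0 * i; }

      // Non-volatile overloads used by newer Kokkos versions.
      KOKKOS_INLINE_FUNCTION
      static void init(value_type& update) { update = 0.0; }
      KOKKOS_INLINE_FUNCTION
      static void join(value_type& update, const value_type& source) {
        update += source;
      }

      // Volatile overloads kept for compatibility with older Kokkos versions.
      KOKKOS_INLINE_FUNCTION
      static void init(volatile value_type& update) { update = 0.0; }
      KOKKOS_INLINE_FUNCTION
      static void join(volatile value_type& update,
                       const volatile value_type& source) {
        update += source;
      }
    };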
|
||||
|
||||
@ -77,7 +77,6 @@ GPU_AWARE_UNKNOWN
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
Kokkos::InitArguments KokkosLMP::args{-1, -1, -1, false};
|
||||
int KokkosLMP::is_finalized = 0;
|
||||
int KokkosLMP::init_ngpus = 0;
|
||||
|
||||
@ -110,7 +109,6 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
||||
ngpus = 0;
|
||||
int device = 0;
|
||||
nthreads = 1;
|
||||
numa = 1;
|
||||
|
||||
int iarg = 0;
|
||||
while (iarg < narg) {
|
||||
@ -189,30 +187,24 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
||||
|
||||
iarg += 2;
|
||||
|
||||
} else if (strcmp(arg[iarg],"n") == 0 ||
|
||||
strcmp(arg[iarg],"numa") == 0) {
|
||||
numa = utils::inumeric(FLERR, arg[iarg+1], false, lmp);
|
||||
iarg += 2;
|
||||
|
||||
} else error->all(FLERR,"Invalid Kokkos command-line arg: {}", arg[iarg]);
|
||||
}
|
||||
|
||||
// Initialize Kokkos. However, we cannot change any
|
||||
// Kokkos library parameters after the first initalization
|
||||
|
||||
if (args.num_threads != -1) {
|
||||
if ((args.num_threads != nthreads) || (args.num_numa != numa) || (args.device_id != device))
|
||||
Kokkos::InitializationSettings args;
|
||||
|
||||
if (args.has_num_threads()) {
|
||||
if ((args.get_num_threads() != nthreads) || (args.get_device_id() != device))
|
||||
if (me == 0)
|
||||
error->warning(FLERR,"Kokkos package already initalized, "
|
||||
"cannot reinitialize with different parameters");
|
||||
nthreads = args.num_threads;
|
||||
numa = args.num_numa;
|
||||
device = args.device_id;
|
||||
error->warning(FLERR,"Kokkos package already initalized. Cannot change parameters");
|
||||
nthreads = args.get_num_threads();
|
||||
device = args.get_device_id();
|
||||
ngpus = init_ngpus;
|
||||
} else {
|
||||
args.num_threads = nthreads;
|
||||
args.num_numa = numa;
|
||||
args.device_id = device;
|
||||
args.set_num_threads(nthreads);
|
||||
args.set_device_id(device);
|
||||
init_ngpus = ngpus;
|
||||
}
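The constructor now builds a Kokkos::InitializationSettings object instead of the removed InitArguments struct, and the numa field disappears entirely. A minimal, hedged sketch of standalone initialization with the new API outside of LAMMPS (setter and getter names are the ones visible in the hunk above; the chosen thread and device values are placeholders):

    #include <Kokkos_Core.hpp>

    int main() {
      Kokkos::InitializationSettings settings;
      settings.set_num_threads(4);  // host threads for this process
      settings.set_device_id(0);    // GPU bound to this process, if any
      Kokkos::initialize(settings);
      {
        // ... build views and launch kernels here ...
      }
      Kokkos::finalize();
    }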
|
||||
|
||||
@ -350,7 +342,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
void KokkosLMP::initialize(Kokkos::InitArguments args, Error *error)
|
||||
void KokkosLMP::initialize(Kokkos::InitializationSettings args, Error *error)
|
||||
{
|
||||
if (!Kokkos::is_initialized()) {
|
||||
if (is_finalized)
|
||||
|
||||
@ -43,7 +43,6 @@ class KokkosLMP : protected Pointers {
|
||||
int forward_fix_comm_changed;
|
||||
int reverse_comm_changed;
|
||||
int nthreads,ngpus;
|
||||
int numa;
|
||||
int auto_sync;
|
||||
int gpu_aware_flag;
|
||||
int neigh_thread;
|
||||
@ -53,12 +52,11 @@ class KokkosLMP : protected Pointers {
|
||||
double binsize;
|
||||
|
||||
static int is_finalized;
|
||||
static Kokkos::InitArguments args;
|
||||
static int init_ngpus;
|
||||
|
||||
KokkosLMP(class LAMMPS *, int, char **);
|
||||
|
||||
static void initialize(Kokkos::InitArguments, Error *);
|
||||
static void initialize(Kokkos::InitializationSettings, Error *);
|
||||
static void finalize();
|
||||
void accelerator(int, char **);
|
||||
int neigh_count(int);
|
||||
|
||||
@ -524,6 +524,12 @@ struct PairReaxKokkosFindBondFunctor {
|
||||
PairReaxFFKokkos<DeviceType> c;
|
||||
PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr):c(*c_ptr) {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void join(int &dst,
|
||||
const int &src) const {
|
||||
dst = MAX(dst,src);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void join(volatile int &dst,
|
||||
const volatile int &src) const {
|
||||
|
||||
@ -52,7 +52,6 @@ class KokkosLMP {
|
||||
int kokkos_exists;
|
||||
int nthreads;
|
||||
int ngpus;
|
||||
int numa;
|
||||
|
||||
KokkosLMP(class LAMMPS *, int, char **) { kokkos_exists = 0; }
|
||||
~KokkosLMP() {}
|
||||
|
||||
@ -91,7 +91,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
|
||||
nthreads = 1;
|
||||
#ifdef _OPENMP
|
||||
if (lmp->kokkos) {
|
||||
nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
|
||||
nthreads = lmp->kokkos->nthreads;
|
||||
} else if (getenv("OMP_NUM_THREADS") == nullptr) {
|
||||
nthreads = 1;
|
||||
if (me == 0)
|
||||
|
||||