Merge pull request #3532 from stanmoore1/kk_occupancy

Update Kokkos version in LAMMPS to 3.7.1
This commit is contained in:
Axel Kohlmeyer
2023-01-20 17:52:05 -05:00
committed by GitHub
67 changed files with 2295 additions and 561 deletions

View File

@ -49,8 +49,8 @@ if(DOWNLOAD_KOKKOS)
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
include(ExternalProject) include(ExternalProject)
set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.00.tar.gz" CACHE STRING "URL for KOKKOS tarball") set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
set(KOKKOS_MD5 "84991eca9f066383abe119a5bc7a11c4" CACHE STRING "MD5 checksum of KOKKOS tarball") set(KOKKOS_MD5 "f140e02b826223b1045207d9bc10d404" CACHE STRING "MD5 checksum of KOKKOS tarball")
mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_URL)
mark_as_advanced(KOKKOS_MD5) mark_as_advanced(KOKKOS_MD5)
ExternalProject_Add(kokkos_build ExternalProject_Add(kokkos_build
@ -74,7 +74,7 @@ if(DOWNLOAD_KOKKOS)
add_dependencies(LAMMPS::KOKKOSCORE kokkos_build) add_dependencies(LAMMPS::KOKKOSCORE kokkos_build)
add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build) add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build)
elseif(EXTERNAL_KOKKOS) elseif(EXTERNAL_KOKKOS)
find_package(Kokkos 3.7.00 REQUIRED CONFIG) find_package(Kokkos 3.7.01 REQUIRED CONFIG)
target_link_libraries(lammps PRIVATE Kokkos::kokkos) target_link_libraries(lammps PRIVATE Kokkos::kokkos)
target_link_libraries(lmp PRIVATE Kokkos::kokkos) target_link_libraries(lmp PRIVATE Kokkos::kokkos)
else() else()

View File

@ -105,13 +105,12 @@ Either the full word or an abbreviation can be used for the keywords.
Note that the keywords do not use a leading minus sign. I.e. the Note that the keywords do not use a leading minus sign. I.e. the
keyword is "t", not "-t". Also note that each of the keywords has a keyword is "t", not "-t". Also note that each of the keywords has a
default setting. Examples of when to use these options and what default setting. Examples of when to use these options and what
settings to use on different platforms is given on the :doc:`KOKKOS package <Speed_kokkos>` settings to use on different platforms is given on the :doc:`KOKKOS
doc page. package <Speed_kokkos>` doc page.
* d or device * d or device
* g or gpus * g or gpus
* t or threads * t or threads
* n or numa
.. parsed-literal:: .. parsed-literal::
@ -164,19 +163,10 @@ the number of physical cores per node, to use your available hardware
optimally. This also sets the number of threads used by the host when optimally. This also sets the number of threads used by the host when
LAMMPS is compiled with CUDA=yes. LAMMPS is compiled with CUDA=yes.
.. parsed-literal:: .. deprecated:: 22Dec2022
numa Nm Support for the "numa" or "n" option was removed as its functionality
was ignored in Kokkos for some time already.
This option is only relevant when using pthreads with hwloc support.
In this case Nm defines the number of NUMA regions (typically sockets)
on a node which will be utilized by a single MPI rank. By default Nm
= 1. If this option is used the total number of worker-threads per
MPI rank is threads\*numa. Currently it is always almost better to
assign at least one MPI rank per NUMA region, and leave numa set to
its default value of 1. This is because letting a single process span
multiple NUMA regions induces a significant amount of cross NUMA data
traffic which is slow.
---------- ----------

View File

@ -1,5 +1,27 @@
# Change Log # Change Log
## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)
### Bug Fixes:
- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547)
- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552)
- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521)
- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553)
- Fix warning calling a `__host__` function from a `__host__ __device__` from `View:: as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591)
- OpenMPTarget: adding implementation to set device id. [\#5557](https://github.com/kokkos/kokkos/pull/5557)
- Use `Kokkos::atomic_load` to Correct Race Condition Giving Rise to Seg Faulting Error in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559)
- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561)
- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565)
- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566)
- Do not add -cuda to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569)
- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571)
- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582)
- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573)
- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592)
- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607)
- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622)
## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22) ## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
@ -102,7 +124,6 @@
- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120) - Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117) - Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111) - Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059)
- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957) - Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810) - Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382) - Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)

View File

@ -129,7 +129,7 @@ ENDIF()
set(Kokkos_VERSION_MAJOR 3) set(Kokkos_VERSION_MAJOR 3)
set(Kokkos_VERSION_MINOR 7) set(Kokkos_VERSION_MINOR 7)
set(Kokkos_VERSION_PATCH 00) set(Kokkos_VERSION_PATCH 01)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
@ -152,6 +152,7 @@ ENDIF()
# but scoping issues can make it difficult # but scoping issues can make it difficult
GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) GLOBAL_SET(KOKKOS_COMPILE_OPTIONS)
GLOBAL_SET(KOKKOS_LINK_OPTIONS) GLOBAL_SET(KOKKOS_LINK_OPTIONS)
GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
GLOBAL_SET(KOKKOS_CUDA_OPTIONS) GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS)
GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS)
@ -228,6 +229,9 @@ IF (KOKKOS_HAS_TRILINOS)
# we have to match the annoying behavior, also we have to preserve quotes # we have to match the annoying behavior, also we have to preserve quotes
# which needs another workaround. # which needs another workaround.
SET(KOKKOS_COMPILE_OPTIONS_TMP) SET(KOKKOS_COMPILE_OPTIONS_TMP)
IF (KOKKOS_ENABLE_HIP)
LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS})
ENDIF()
FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
IF(OPTION_HAS_WHITESPACE EQUAL -1) IF(OPTION_HAS_WHITESPACE EQUAL -1)

View File

@ -12,7 +12,7 @@ endif
KOKKOS_VERSION_MAJOR = 3 KOKKOS_VERSION_MAJOR = 3
KOKKOS_VERSION_MINOR = 7 KOKKOS_VERSION_MINOR = 7
KOKKOS_VERSION_PATCH = 00 KOKKOS_VERSION_PATCH = 01
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads" #KOKKOS_DEVICES ?= "Threads"
# Options: # Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9 # IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me #SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@ -505,10 +507,6 @@ KOKKOS_LINK_FLAGS =
KOKKOS_SRC = KOKKOS_SRC =
KOKKOS_HEADERS = KOKKOS_HEADERS =
#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
KOKKOS_LIBS += -latomic
#endif
# Generating the KokkosCore_config.h file. # Generating the KokkosCore_config.h file.
KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@ -550,6 +548,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
endif endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
KOKKOS_LIBS += -latomic
tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET') tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC") tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
@ -1197,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE TEST_OPTIONAL_TPLS CUSPARSE
) )

View File

@ -265,8 +265,8 @@ class BinSort {
//---------------------------------------- //----------------------------------------
// Create the permutation vector, the bin_offset array and the bin_count // Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed // array. Can be called again if keys changed
template <class ExecutionSpace = exec_space> template <class ExecutionSpace>
void create_permute_vector(const ExecutionSpace& exec = exec_space{}) { void create_permute_vector(const ExecutionSpace& exec) {
static_assert( static_assert(
Kokkos::SpaceAccessibility<ExecutionSpace, Kokkos::SpaceAccessibility<ExecutionSpace,
typename Space::memory_space>::accessible, typename Space::memory_space>::accessible,
@ -297,6 +297,15 @@ class BinSort {
*this); *this);
} }
// Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed
void create_permute_vector() {
Kokkos::fence("Kokkos::Binsort::create_permute_vector: before");
exec_space e{};
create_permute_vector(e);
e.fence("Kokkos::Binsort::create_permute_vector: after");
}
// Sort a subset of a view with respect to the first dimension using the // Sort a subset of a view with respect to the first dimension using the
// permutation array // permutation array
template <class ExecutionSpace, class ValuesViewType> template <class ExecutionSpace, class ValuesViewType>
@ -372,9 +381,10 @@ class BinSort {
template <class ValuesViewType> template <class ValuesViewType>
void sort(ValuesViewType const& values, int values_range_begin, void sort(ValuesViewType const& values, int values_range_begin,
int values_range_end) const { int values_range_end) const {
Kokkos::fence("Kokkos::Binsort::sort: before");
exec_space exec; exec_space exec;
sort(exec, values, values_range_begin, values_range_end); sort(exec, values, values_range_begin, values_range_end);
exec.fence("Kokkos::Sort: fence after sorting"); exec.fence("Kokkos::BinSort:sort: after");
} }
template <class ExecutionSpace, class ValuesViewType> template <class ExecutionSpace, class ValuesViewType>
@ -641,9 +651,10 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
template <class ViewType> template <class ViewType>
void sort(ViewType const& view) { void sort(ViewType const& view) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec; typename ViewType::execution_space exec;
sort(exec, view); sort(exec, view);
exec.fence("Kokkos::Sort: fence after sorting"); exec.fence("Kokkos::sort: fence after sorting");
} }
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
@ -682,6 +693,7 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
template <class ViewType> template <class ViewType>
void sort(ViewType view, size_t const begin, size_t const end) { void sort(ViewType view, size_t const begin, size_t const end) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec; typename ViewType::execution_space exec;
sort(exec, view, begin, end); sort(exec, view, begin, end);
exec.fence("Kokkos::Sort: fence after sorting"); exec.fence("Kokkos::Sort: fence after sorting");

View File

@ -10,10 +10,12 @@
# Default settings: change those according to your machine. For # Default settings: change those according to your machine. For
# example, you may have have two different wrappers with either icpc # example, you may have have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten # or g++ as their back-end compiler. The defaults can be overwritten
# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc). # by using the usual arguments (e.g., -arch=sm_80 -ccbin icpc).
# sm_70 is supported by every CUDA version from 9-12 and is thus
# chosen as default
default_arch="sm_35" default_arch="sm_70"
#default_arch="sm_50" #default_arch="sm_80"
# #
# The default C++ compiler. # The default C++ compiler.

View File

@ -66,6 +66,7 @@
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
#cmakedefine KOKKOS_ARCH_ARMV81 #cmakedefine KOKKOS_ARCH_ARMV81
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
#cmakedefine KOKKOS_ARCH_A64FX
#cmakedefine KOKKOS_ARCH_AMD_AVX2 #cmakedefine KOKKOS_ARCH_AMD_AVX2
#cmakedefine KOKKOS_ARCH_AVX #cmakedefine KOKKOS_ARCH_AVX
#cmakedefine KOKKOS_ARCH_AVX2 #cmakedefine KOKKOS_ARCH_AVX2
@ -101,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE #cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80 #cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_AMPERE86
#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2 #cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3 #cmakedefine KOKKOS_ARCH_AMD_ZEN3

View File

@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default: default:
std::cout << "Compute capability " << compute_capability std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl; << " is not supported" << std::endl;

View File

@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS) IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON) SET(KOKKOS_SHOW_HIP_ARCHS ON)
@ -187,7 +188,9 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
SET(CUDA_ARCH_FLAG "-gpu") SET(CUDA_ARCH_FLAG "-gpu")
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda)
IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled
GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda) GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
ENDIF()
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
SET(CUDA_ARCH_FLAG "-arch") SET(CUDA_ARCH_FLAG "-arch")
ENDIF() ENDIF()
@ -542,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75) CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80) CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86) CHECK_CUDA_ARCH(AMPERE86 sm_86)
CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@ -804,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON) SET(KOKKOS_ARCH_AMPERE ON)
ENDIF() ENDIF()
IF (KOKKOS_ARCH_HOPPER90)
SET(KOKKOS_ARCH_HOPPER ON)
ENDIF()
#Regardless of version, make sure we define the general architecture name #Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A) IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON) SET(KOKKOS_ARCH_VEGA ON)

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE TEST_OPTIONAL_TPLS CUSPARSE
) )

View File

@ -1701,7 +1701,11 @@ namespace Impl {
underlying memory, to facilitate implementation of deep_copy() and underlying memory, to facilitate implementation of deep_copy() and
other routines that are defined on View */ other routines that are defined on View */
template <unsigned N, typename T, typename... Args> template <unsigned N, typename T, typename... Args>
KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView<T, Args...> v) { KOKKOS_FUNCTION auto as_view_of_rank_n(
DynRankView<T, Args...> v,
typename std::enable_if<std::is_same<
typename ViewTraits<T, Args...>::specialize, void>::value>::type* =
nullptr) {
if (v.rank() != N) { if (v.rank() != N) {
KOKKOS_IF_ON_HOST( KOKKOS_IF_ON_HOST(
const std::string message = const std::string message =
@ -2114,6 +2118,7 @@ inline auto create_mirror(
namespace Impl { namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
std::is_same< std::is_same<
typename DynRankView<T, P...>::memory_space, typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value && typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
@ -2128,6 +2133,7 @@ create_mirror_view(const DynRankView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same< !(std::is_same<
typename DynRankView<T, P...>::memory_space, typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value && typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
@ -2141,29 +2147,39 @@ create_mirror_view(
return Kokkos::Impl::create_mirror(src, arg_prop); return Kokkos::Impl::create_mirror(src, arg_prop);
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t< inline std::enable_if_t<
Kokkos::is_space<Space>::value && Kokkos::is_space<
Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
typename Impl::MirrorDRViewType<Space, T, P...>::view_type> Impl::MirrorDRViewType<
create_mirror_view(const Space&, const Kokkos::DynRankView<T, P...>& src, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view(const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>&) { const typename Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t< inline std::enable_if_t<
Kokkos::is_space<Space>::value && Kokkos::is_space<
!Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
typename Impl::MirrorDRViewType<Space, T, P...>::view_type> !Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view( create_mirror_view(
const Space&, const Kokkos::DynRankView<T, P...>& src, const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space; return Kokkos::Impl::create_mirror(src, arg_prop);
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);
return Kokkos::Impl::create_mirror(src, prop_copy);
} }
} // namespace Impl } // namespace Impl
@ -2224,9 +2240,10 @@ create_mirror_view(
template <class Space, class T, class... P> template <class Space, class T, class... P>
inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi,
const Space& space, const Space&,
const Kokkos::DynRankView<T, P...>& src) { const Kokkos::DynRankView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi)); return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>

View File

@ -710,7 +710,7 @@ template <class Space, class T, class... P>
inline auto create_mirror( inline auto create_mirror(
const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) { const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror( return Impl::create_mirror(
src, Impl::ViewCtorProp<>{typename Space::memory_space{}}); src, Kokkos::view_alloc(typename Space::memory_space{}));
} }
template <class Space, class T, class... P> template <class Space, class T, class... P>
@ -729,8 +729,10 @@ inline auto create_mirror(
} }
namespace Impl { namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same< (std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView< typename Kokkos::Experimental::DynamicView<
@ -740,14 +742,14 @@ inline std::enable_if_t<
typename Kokkos::Experimental::DynamicView< typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value), T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror> typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
create_mirror_view( create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const typename Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same< !(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView< typename Kokkos::Experimental::DynamicView<
@ -762,15 +764,33 @@ create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
return Kokkos::create_mirror(arg_prop, src); return Kokkos::create_mirror(arg_prop, src);
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
inline std::enable_if_t< class = std::enable_if_t<
Impl::MirrorDynamicViewType<Space, T, P...>::is_same_memspace, Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type> std::enable_if_t<Impl::MirrorDynamicViewType<
create_mirror_view(const Space&, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
const Kokkos::Experimental::DynamicView<T, P...>& src, T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl } // namespace Impl
// Create a mirror view in host space // Create a mirror view in host space
@ -790,8 +810,9 @@ inline auto create_mirror_view(
// Create a mirror in a new space // Create a mirror in a new space
template <class Space, class T, class... P> template <class Space, class T, class... P>
inline auto create_mirror_view( inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::DynamicView<T, P...>& src) { const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{}); return Impl::create_mirror_view(src,
view_alloc(typename Space::memory_space{}));
} }
template <class Space, class T, class... P> template <class Space, class T, class... P>

View File

@ -1901,19 +1901,22 @@ struct MirrorOffsetType {
namespace Impl { namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src, create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror( return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror(
Kokkos::create_mirror(arg_prop, src.view()), src.begins()); Kokkos::create_mirror(arg_prop, src.view()), src.begins());
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
inline typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type class = std::enable_if_t<
create_mirror(const Space&, Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
const Kokkos::Experimental::OffsetView<T, P...>& src, inline auto create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space;
static_assert( static_assert(
!alloc_prop_input::has_label, !alloc_prop_input::has_label,
@ -1923,10 +1926,6 @@ create_mirror(const Space&,
!alloc_prop_input::has_pointer, !alloc_prop_input::has_pointer,
"The view constructor arguments passed to Kokkos::create_mirror must " "The view constructor arguments passed to Kokkos::create_mirror must "
"not include a pointer!"); "not include a pointer!");
static_assert(
!alloc_prop_input::has_memory_space,
"The view constructor arguments passed to Kokkos::create_mirror must "
"not include a memory space instance!");
static_assert( static_assert(
!alloc_prop_input::allow_padding, !alloc_prop_input::allow_padding,
"The view constructor arguments passed to Kokkos::create_mirror must " "The view constructor arguments passed to Kokkos::create_mirror must "
@ -1962,15 +1961,17 @@ inline auto create_mirror(
template <class Space, class T, class... P, template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror( inline auto create_mirror(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) { const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{}); return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}));
} }
template <class Space, class T, class... P> template <class Space, class T, class... P>
typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space, create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) { const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Kokkos::view_alloc(wi)); return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
@ -1983,22 +1984,24 @@ inline auto create_mirror(
namespace Impl { namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same< (std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView< typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value && T, P...>::HostMirror::memory_space>::value &&
std::is_same<typename Kokkos::Experimental::OffsetView<T, P...>::data_type, std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView< typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value), T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror> typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror_view( create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const typename Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same< !(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView< typename Kokkos::Experimental::OffsetView<
@ -2013,24 +2016,32 @@ create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
return Kokkos::create_mirror(arg_prop, src); return Kokkos::create_mirror(arg_prop, src);
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
inline std::enable_if_t< class = std::enable_if_t<
Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace, Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
Kokkos::Experimental::OffsetView<T, P...>> std::enable_if_t<Impl::MirrorOffsetViewType<
create_mirror_view(const Space&, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
const Kokkos::Experimental::OffsetView<T, P...>& src, T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
std::enable_if_t< class = std::enable_if_t<
!Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace, Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type> std::enable_if_t<!Impl::MirrorOffsetViewType<
create_mirror_view(const Space& space, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
const Kokkos::Experimental::OffsetView<T, P...>& src, T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return create_mirror(space, src, arg_prop); return Kokkos::Impl::create_mirror(src, arg_prop);
} }
} // namespace Impl } // namespace Impl
@ -2052,15 +2063,17 @@ inline auto create_mirror_view(
template <class Space, class T, class... P, template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror_view( inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) { const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{}); return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}));
} }
template <class Space, class T, class... P> template <class Space, class T, class... P>
inline auto create_mirror_view( inline auto create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, const Space& space, Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) { const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi)); return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>

View File

@ -46,3 +46,13 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources})
endif() endif()
endforeach() endforeach()
SET(COMPILE_ONLY_SOURCES
TestCreateMirror.cpp
)
KOKKOS_ADD_EXECUTABLE(
TestCompileOnly
SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES}
)

View File

@ -0,0 +1 @@
int main() {}

View File

@ -0,0 +1,179 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <Kokkos_DynamicView.hpp>
#include <Kokkos_DynRankView.hpp>
#include <Kokkos_OffsetView.hpp>
template <typename TestView, typename MemorySpace>
void check_memory_space(TestView, MemorySpace) {
static_assert(
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
}
template <class View>
auto host_mirror_test_space(View) {
return std::conditional_t<
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
typename View::memory_space>::accessible,
typename View::memory_space, Kokkos::HostSpace>{};
}
template <typename View>
void test_create_mirror_properties(const View& view) {
using namespace Kokkos;
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
// clang-format off
// create_mirror
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror( view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror_view
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror view_alloc
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc + execution space
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
// clang-format on
}
void test_create_mirror_dynrankview() {
Kokkos::DynRankView<int, Kokkos::DefaultExecutionSpace> device_view(
"device view", 10);
Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
void test_reate_mirror_offsetview() {
Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace>
device_view("device view", {0, 10});
Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
"host view", {0, 10});
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
void test_create_mirror_dynamicview() {
Kokkos::Experimental::DynamicView<int*, Kokkos::DefaultExecutionSpace>
device_view("device view", 2, 10);
Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
"host view", 2, 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib
TEST_OPTIONAL_TPLS CUSPARSE TEST_OPTIONAL_TPLS CUSPARSE
) )

View File

@ -53,13 +53,69 @@
namespace Kokkos { namespace Kokkos {
namespace Impl { namespace Impl {
inline int cuda_warp_per_sm_allocation_granularity(
cudaDeviceProp const& properties) {
// Allocation granularity of warps in each sm
switch (properties.major) {
case 3:
case 5:
case 7:
case 8:
case 9: return 4;
case 6: return (properties.minor == 0 ? 2 : 4);
default:
throw_runtime_exception(
"Unknown device in cuda warp per sm allocation granularity");
return 0;
}
}
inline int cuda_max_warps_per_sm_registers(
cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
// Maximum number of warps per sm as a function of register counts,
// subject to the constraint that warps are allocated with a fixed granularity
int const max_regs_per_block = properties.regsPerBlock;
int const regs_per_warp = attributes.numRegs * properties.warpSize;
int const warp_granularity =
cuda_warp_per_sm_allocation_granularity(properties);
// The granularity of register allocation is chunks of 256 registers per warp,
// which implies a need to over-allocate, so we round up
int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);
// The maximum number of warps per SM is constrained from above by register
// allocation. To satisfy the constraint that warps per SM is allocated at a
// finite granularity, we need to round down.
int const max_warps_per_sm =
warp_granularity *
(max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
return max_warps_per_sm;
}
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes, cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) { int block_size, size_t dynamic_shmem) {
// Limits due do registers/SM // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor; int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs; int const regs_per_thread = attributes.numRegs;
int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size); // The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
// Compute the maximum number of warps as a function of the number of
// registers
int const max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(properties, attributes);
// Constrain the number of blocks to respect the maximum number of warps per
// SM On face value this should be an equality, but due to the warp
// granularity constraints noted in `cuda_max_warps_per_sm_registers` the
// left-hand-side of this comparison can overshoot what the hardware allows
// based on register counts alone
while ((max_blocks_regs * block_size / properties.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;
// Limits due to shared memory/SM // Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor; size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@ -203,40 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{}); LaunchBounds{});
} }
// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1) template <class LaunchBounds>
// NOTE these number can be obtained several ways: int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select LaunchBounds) {
// "Compute Capability" first and check what is the smallest "Shared Memory auto const& prop = Kokkos::Cuda().cuda_device_prop();
// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
// bytes is then to be found below in the summary. // Thin version of cuda_get_opt_block_size for cases where there is no shared
// * Another option would be to look for the information in the "Tuning // memory
// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
// the "Shared Memory" section (more tedious)
inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
int const compute_capability = properties.major * 10 + properties.minor; LaunchBounds{});
return [compute_capability]() {
switch (compute_capability) {
case 30:
case 32:
case 35: return 16;
case 37: return 80;
case 50:
case 53:
case 60:
case 62: return 64;
case 52:
case 61: return 96;
case 70:
case 80:
case 86: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
"Unknown device in cuda block size deduction");
}
return 0;
}() * 1024;
} }
} // namespace Impl } // namespace Impl
} // namespace Kokkos } // namespace Kokkos

View File

@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010 #endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \ #if CUDA_VERSION >= 11010 && \
((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86))) ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; } bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION

View File

@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
} }
#endif #endif
#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
#else
cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
#endif
// Init the array for used for arbitrarily sized atomics // Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays(); if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();

View File

@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance. // function qualifier which could be used to improve performance.
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template <class DriverType> template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() { __global__ static void cuda_parallel_launch_constant_memory() {
@ -158,63 +154,119 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
} }
} }
// This function needs to be templated on DriverType and LaunchBounds // These functions need to be templated on DriverType and LaunchBounds
// so that the static bool is unique for each type combo // so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information. // KernelFuncPtr does not necessarily contain that type information.
template <class DriverType, class LaunchBounds, class KernelFuncPtr> template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(KernelFuncPtr const& func, const cudaFuncAttributes& get_cuda_kernel_func_attributes(
bool prefer_shmem) { const KernelFuncPtr& func) {
#ifndef KOKKOS_ARCH_KEPLER // Only call cudaFuncGetAttributes once for each unique kernel
// On Kepler the L1 has no benefit since it doesn't cache reads // by leveraging static variable initialization rules
auto set_cache_config = [&] { auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig( cudaFuncAttributes attr;
func, KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
(prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1))); return attr;
return prefer_shmem;
}; };
static bool cache_config_preference_cached = set_cache_config(); static cudaFuncAttributes func_attr = wrap_get_attributes();
if (cache_config_preference_cached != prefer_shmem) { return func_attr;
}
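A minimal sketch, in plain C++ with a hypothetical Attributes type, of the caching idiom used by get_cuda_kernel_func_attributes above: the function-local static is initialized exactly once (and thread-safely) per unique template instantiation, so the expensive query runs a single time per kernel type.

#include <cstdio>

struct Attributes { int num_regs; };

template <class KernelTag>
const Attributes& cached_attributes() {
  auto query = []() -> Attributes {
    std::puts("expensive query (stand-in for cudaFuncGetAttributes)");
    return Attributes{42};
  };
  static Attributes attr = query();  // initialized once per KernelTag
  return attr;
}

struct KernelA {};
struct KernelB {};

int main() {
  cached_attributes<KernelA>();  // runs the query
  cached_attributes<KernelA>();  // cached, no query
  cached_attributes<KernelB>();  // separate instantiation, runs the query
}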
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(const KernelFuncPtr& func,
const cudaDeviceProp& device_props,
const size_t block_size, int& shmem,
const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
const auto& func_attr =
get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
// Compute limits for number of blocks due to registers/SM
const size_t regs_per_sm = device_props.regsPerMultiprocessor;
const size_t regs_per_thread = func_attr.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
// Compute the maximum number of warps as a function of the number of
// registers
const size_t max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(device_props, func_attr);
// Constrain the number of blocks to respect the maximum number of warps per
// SM. On face value this should be an equality, but due to the warp
// granularity constraints noted in `cuda_max_warps_per_sm_registers` the
// left-hand-side of this comparison can overshoot what the hardware allows
// based on register counts alone
while ((max_blocks_regs * block_size / device_props.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
const size_t num_threads_desired =
((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
// Get close to the desired occupancy,
// don't undershoot by much but also don't allocate a whole new block just
// because one is a few threads over otherwise.
size_t num_blocks_desired =
(num_threads_desired + block_size * 0.8) / block_size;
num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
if (num_blocks_desired == 0) num_blocks_desired = 1;
// Calculate how much shared memory we need per block
size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
// The minimum shared memory allocation we can have in total per SM is 8kB.
// If we want to lower occupancy we have to make sure we request at least that
// much in aggregate over all blocks, so that shared memory actually becomes a
// limiting factor for occupancy
constexpr size_t min_shmem_size_per_sm = 8192;
if ((occupancy < 100) &&
(shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
// Need to set the caller's shmem variable so that the
// kernel launch uses the correct dynamic shared memory request
shmem = shmem_per_block - func_attr.sharedSizeBytes;
}
// Compute the carveout fraction we need based on occupancy
// Use multiples of 8kB
const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
size_t carveout = shmem_per_block == 0
? 0
: 100 *
(((num_blocks_desired * shmem_per_block +
min_shmem_size_per_sm - 1) /
min_shmem_size_per_sm) *
min_shmem_size_per_sm) /
max_shmem_per_sm;
if (carveout > 100) carveout = 100;
// Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
return carveout;
};
// Store the value in a static variable so we only reset if needed
static size_t cache_config_preference_cached = set_cache_config();
if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config(); cache_config_preference_cached = set_cache_config();
} }
#else #else
// Use the parameters so we don't get a warning // Use the parameters so we don't get a warning
(void)func; (void)func;
(void)prefer_shmem; (void)device_props;
(void)block_size;
(void)occupancy;
#endif #endif
} }
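A standalone worked example, with assumed device properties and an assumed 50% occupancy request, of the arithmetic in configure_shmem_preference above: the dynamic shared-memory request is padded so that shared memory actually becomes the occupancy limiter, and the carveout is then derived in 8 KiB steps.

#include <algorithm>
#include <cstdio>

int main() {
  // Assumed values for illustration only.
  const size_t max_threads_per_sm = 2048;
  const size_t max_shmem_per_sm   = 98304;  // 96 KiB
  const size_t block_size         = 128;
  const size_t occupancy          = 50;     // requested percentage
  const size_t max_blocks_regs    = 32;     // assume registers are not limiting
  const size_t static_shmem       = 0;
  size_t shmem                    = 0;      // dynamic request from the caller

  const size_t num_threads_desired =
      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;          // 1024
  size_t num_blocks_desired =
      (num_threads_desired + block_size * 0.8) / block_size;            // 8
  num_blocks_desired = std::min(max_blocks_regs, num_blocks_desired);
  if (num_blocks_desired == 0) num_blocks_desired = 1;

  size_t shmem_per_block = shmem + static_shmem;
  constexpr size_t min_shmem_size_per_sm = 8192;
  if ((occupancy < 100) &&
      (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
    shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;       // 1024
    shmem           = shmem_per_block - static_shmem;                   // 1024
  }

  size_t carveout =
      shmem_per_block == 0
          ? 0
          : 100 *
                (((num_blocks_desired * shmem_per_block +
                   min_shmem_size_per_sm - 1) /
                  min_shmem_size_per_sm) *
                 min_shmem_size_per_sm) /
                max_shmem_per_sm;                                       // 8
  if (carveout > 100) carveout = 100;

  std::printf("padded dynamic shmem: %zu bytes, carveout: %zu%%\n", shmem,
              carveout);
}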
template <class Policy>
std::enable_if_t<Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const& policy, cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
bool& prefer_shmem) {
int const block_size = block.x * block.y * block.z;
int const desired_occupancy = policy.impl_get_desired_occupancy().value();
size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
size_t const static_shmem = attributes.sharedSizeBytes;
// round to nearest integer and avoid division by zero
int active_blocks = std::max(
1, static_cast<int>(std::round(
static_cast<double>(properties.maxThreadsPerMultiProcessor) /
block_size * desired_occupancy / 100)));
int const dynamic_shmem =
shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
if (dynamic_shmem > shmem) {
shmem = dynamic_shmem;
prefer_shmem = false;
}
}
template <class Policy>
std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
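A small sketch, with hypothetical policy types, of the enable_if dispatch pattern used by the two overloads above: the overload is chosen at compile time from a constexpr bool trait of the policy, and the no-op overload compiles away for policies without an occupancy request.

#include <cstdio>
#include <type_traits>

struct PlainPolicy {
  static constexpr bool contains_desired_occupancy = false;
};
struct OccupancyPolicy {
  static constexpr bool contains_desired_occupancy = true;
  int desired_occupancy = 75;
};

template <class Policy>
std::enable_if_t<Policy::contains_desired_occupancy> tune_launch(
    const Policy& p, int& shmem) {
  std::printf("tuning launch for %d%% occupancy\n", p.desired_occupancy);
  shmem += 1024;  // stand-in for the shared-memory adjustment
}

template <class Policy>
std::enable_if_t<!Policy::contains_desired_occupancy> tune_launch(
    const Policy&, int& /*shmem*/) {}  // no-op overload

int main() {
  int shmem = 0;
  tune_launch(PlainPolicy{}, shmem);      // picks the no-op overload
  tune_launch(OccupancyPolicy{}, shmem);  // picks the tuning overload
  std::printf("shmem = %d\n", shmem);
}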
// </editor-fold> end Some helper functions for launch code readability }}}1 // </editor-fold> end Some helper functions for launch code readability }}}1
//============================================================================== //==============================================================================
@ -348,7 +400,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node( inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) { CudaInternal const* cuda_instance) {
//---------------------------------------- //----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver); auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph)); KOKKOS_EXPECTS(bool(graph));
@ -358,8 +410,19 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) { if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem); Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>( Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem); base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
void const* args[] = {&driver}; void const* args[] = {&driver};
@ -442,7 +505,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node( inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) { CudaInternal const* cuda_instance) {
//---------------------------------------- //----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver); auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph)); KOKKOS_EXPECTS(bool(graph));
@ -452,8 +515,18 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) { if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem); Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>( Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem); base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@ -566,7 +639,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node( inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) { CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant // Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since // memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require // events don't work with Graphs directly, and this would anyway require
@ -580,7 +653,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds, DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>; Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node( global_launch_impl_t::create_parallel_launch_graph_node(
driver, grid, block, shmem, cuda_instance, prefer_shmem); driver, grid, block, shmem, cuda_instance);
} }
#endif #endif
}; };
@ -613,8 +686,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid, inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem, const dim3& block, int shmem,
const CudaInternal* cuda_instance, const CudaInternal* cuda_instance) {
bool prefer_shmem) {
if (!Impl::is_empty_launch(grid, block)) { if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration // Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel // preference and launch the same kernel
@ -623,20 +695,22 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem); Impl::check_shmem_request(cuda_instance, shmem);
// If a desired occupancy is specified, we compute how much shared memory if (DriverType::Policy::
// to ask for to achieve that occupancy, assuming that the cache experimental_contains_desired_occupancy) {
// configuration is `cudaFuncCachePreferL1`. If the amount of dynamic /*int desired_occupancy =
// shared memory computed is actually smaller than `shmem` we overwrite driver.get_policy().impl_get_desired_occupancy().value();
// `shmem` and set `prefer_shmem` to `false`. size_t block_size = block.x * block.y * block.z;
modify_launch_configuration_if_desired_occupancy_is_specified(
driver.get_policy(), cuda_instance->m_deviceProp,
get_cuda_func_attributes(), block, shmem, prefer_shmem);
Impl::configure_shmem_preference< Impl::configure_shmem_preference<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>( DriverType,
base_t::get_kernel_func(), prefer_shmem); Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
ensure_cuda_lock_arrays_on_device(); KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device // Invoke the driver function on the device
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
@ -650,18 +724,9 @@ struct CudaParallelLaunchImpl<
} }
static cudaFuncAttributes get_cuda_func_attributes() { static cudaFuncAttributes get_cuda_func_attributes() {
// Race condition inside of cudaFuncGetAttributes if the same address is return get_cuda_kernel_func_attributes<
// given requires using a local variable as input instead of a static Rely DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
// on static variable initialization to make sure only one thread executes base_t::get_kernel_func());
// the code and the result is visible.
auto wrap_get_attributes = []() -> cudaFuncAttributes {
cudaFuncAttributes attr_tmp;
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
return attr_tmp;
};
static cudaFuncAttributes attr = wrap_get_attributes();
return attr;
} }
}; };

View File

@ -79,7 +79,8 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
void initialize_host_cuda_lock_arrays() { void initialize_host_cuda_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
desul::Impl::init_lock_arrays(); desul::Impl::init_lock_arrays();
desul::ensure_cuda_lock_arrays_on_device();
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
#endif #endif
if (g_host_cuda_lock_arrays.atomic != nullptr) return; if (g_host_cuda_lock_arrays.atomic != nullptr) return;
KOKKOS_IMPL_CUDA_SAFE_CALL( KOKKOS_IMPL_CUDA_SAFE_CALL(
@ -88,7 +89,7 @@ void initialize_host_cuda_lock_arrays() {
Impl::cuda_device_synchronize( Impl::cuda_device_synchronize(
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays"); "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
g_host_cuda_lock_arrays.n = Cuda::concurrency(); g_host_cuda_lock_arrays.n = Cuda::concurrency();
copy_cuda_lock_arrays_to_device(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
256>>>(); 256>>>();
Impl::cuda_device_synchronize( Impl::cuda_device_synchronize(
@ -105,7 +106,7 @@ void finalize_host_cuda_lock_arrays() {
g_host_cuda_lock_arrays.atomic = nullptr; g_host_cuda_lock_arrays.atomic = nullptr;
g_host_cuda_lock_arrays.n = 0; g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
copy_cuda_lock_arrays_to_device(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif #endif
} }

View File

@ -67,7 +67,7 @@ struct CudaLockArrays {
/// \brief This global variable in Host space is the central definition /// \brief This global variable in Host space is the central definition
/// of these arrays. /// of these arrays.
extern CudaLockArrays g_host_cuda_lock_arrays; extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;
/// \brief After this call, the g_host_cuda_lock_arrays variable has /// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays. /// valid, initialized arrays.
@ -105,12 +105,12 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global /// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels /// variable based on the Host global variable prior to running any kernels
/// that will use it. /// that will use it.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. /// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__ __device__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__constant__ extern __constant__ extern
#endif #endif
CudaLockArrays g_device_cuda_lock_arrays; Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
@ -123,7 +123,9 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr); size_t offset = size_t(ptr);
offset = offset >> 2; offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK; offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1)); return (
0 ==
atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
} }
/// \brief Release lock for the address /// \brief Release lock for the address
@ -136,7 +138,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr); size_t offset = size_t(ptr);
offset = offset >> 2; offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK; offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0); atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
} }
} // namespace Impl } // namespace Impl
@ -149,49 +151,45 @@ namespace {
static int lock_array_copied = 0; static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; } inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace } // namespace
} // namespace Impl
} // namespace Kokkos
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE /* Dan Ibanez: it is critical that this code be a macro, so that it will
inline capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
#else putting this in an inline function will NOT do the right thing! */
static #define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
#endif { \
void if (::Kokkos::Impl::lock_array_copied == 0) { \
copy_cuda_lock_arrays_to_device() { KOKKOS_IMPL_CUDA_SAFE_CALL( \
if (lock_array_copied == 0) { cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays, &Kokkos::Impl::g_host_cuda_lock_arrays, \
&g_host_cuda_lock_arrays, sizeof(Kokkos::Impl::CudaLockArrays))); \
sizeof(CudaLockArrays))); } \
lock_array_copied = 1; \
} }
lock_array_copied = 1;
}
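A minimal CUDA sketch (hypothetical file and names, compiled without -rdc=true) of why the copy is macro-expanded at the call site: each translation unit gets its own instance of a __constant__ symbol, so the cudaMemcpyToSymbol must execute in the same translation unit whose kernels read that instance.

// per_tu_symbol_example.cu
#include <cstdio>
#include <cuda_runtime.h>

__device__ __constant__ int g_config;  // one instance per translation unit

#define COPY_CONFIG_TO_DEVICE(value)                         \
  {                                                          \
    int host_value = (value);                                \
    cudaMemcpyToSymbol(g_config, &host_value, sizeof(int));  \
  }

__global__ void use_config(int* out) { *out = g_config; }

int main() {
  int* out;
  cudaMallocManaged(&out, sizeof(int));
  COPY_CONFIG_TO_DEVICE(42);  // expands here, targets this TU's symbol
  use_config<<<1, 1>>>(out);
  cudaDeviceSynchronize();
  std::printf("device saw %d\n", *out);
  cudaFree(out);
  return 0;
}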
#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {} #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else #else
inline static void ensure_cuda_lock_arrays_on_device() { #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
copy_cuda_lock_arrays_to_device(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
}
#endif #endif
#else #else
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {} #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else #else
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. // Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
inline static void ensure_cuda_lock_arrays_on_device() { #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
copy_cuda_lock_arrays_to_device(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
desul::ensure_cuda_lock_arrays_on_device(); DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
}
#endif #endif
#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
} // namespace Impl
} // namespace Kokkos
#endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */

View File

@ -67,6 +67,34 @@
namespace Kokkos { namespace Kokkos {
namespace Impl { namespace Impl {
template <typename ParallelType, typename Policy, typename LaunchBounds>
int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelType,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const optimal_block_size =
Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
// Compute how many blocks of this size we can launch, based on warp
// constraints
int const max_warps_per_sm_registers =
Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
int const max_num_threads_from_warps =
max_warps_per_sm_registers * prop.warpSize;
int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
// Compute the total number of threads
int const max_threads_per_sm = optimal_block_size * max_num_blocks;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
}
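A standalone sketch, with assumed register and block-size numbers, of the tile-size bound computed by max_tile_size_product_helper above.

#include <algorithm>
#include <cstdio>

int main() {
  // Assumed values for illustration only.
  const int warp_size                    = 32;
  const int max_warps_per_sm_registers   = 48;    // from register limits
  const int optimal_block_size           = 256;   // from block-size deduction
  const int max_hierarchical_parallelism = 1024;  // stand-in for the CudaTraits cap

  const int max_num_threads_from_warps = max_warps_per_sm_registers * warp_size;  // 1536
  const int max_num_blocks     = max_num_threads_from_warps / optimal_block_size; // 6
  const int max_threads_per_sm = optimal_block_size * max_num_blocks;             // 1536

  std::printf("max tile size product: %d\n",
              std::min(max_threads_per_sm, max_hierarchical_parallelism));        // 1024
}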
template <class FunctorType, class... Traits> template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public: public:
@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public: public:
template <typename Policy, typename Functor> template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) { static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr = return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
CudaParallelLaunch<ParallelFor,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
} }
Policy const& get_policy() const { return m_rp; } Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const { inline __device__ void operator()() const {
@ -121,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]), maxblocks[1]),
1); 1);
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else if (RP::rank == 3) { } else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0); KOKKOS_ASSERT(block.x > 0);
@ -139,8 +155,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2])); maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else if (RP::rank == 4) { } else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z // threadIdx.z
@ -158,8 +173,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2])); maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else if (RP::rank == 5) { } else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z // threadIdx.z
@ -175,8 +189,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2])); maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else if (RP::rank == 6) { } else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z // threadIdx.z
@ -191,8 +204,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2])); maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else { } else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
} }
@ -263,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public: public:
template <typename Policy, typename Functor> template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) { static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr = return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
CudaParallelLaunch<ParallelReduce,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
} }
Policy const& get_policy() const { return m_policy; } Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const { inline __device__ void exec_range(reference_type update) const {
@ -405,8 +407,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>( CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) { if (!m_result_ptr_device_accessible) {
if (m_result_ptr) { if (m_result_ptr) {

View File

@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif #endif
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_policy.space().impl_internal_space_instance(), *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
false);
} }
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>( CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) { if (!m_result_ptr_device_accessible) {
if (m_result_ptr) { if (m_result_ptr) {
@ -465,8 +464,24 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
public: public:
using pointer_type = typename Analysis::pointer_type; using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type; using reference_type = typename Analysis::reference_type;
using value_type = typename Analysis::value_type;
using functor_type = FunctorType; using functor_type = FunctorType;
using size_type = Cuda::size_type; using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
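  // Illustration (assumed, with Cuda::size_type taken as a 4-byte unsigned):
  //   value_type of 2 bytes -> word_size_type = int16_t
  //   value_type of 1 byte  -> word_size_type = int8_t
  //   value_type = double   -> word_size_type = size_type, so the scan moves
  //                            sizeof(double) / sizeof(size_type) = 2 words per value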
private: private:
// Algorithmic constraints: // Algorithmic constraints:
@ -477,7 +492,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
const FunctorType m_functor; const FunctorType m_functor;
const Policy m_policy; const Policy m_policy;
size_type* m_scratch_space; word_size_type* m_scratch_space;
size_type* m_scratch_flags; size_type* m_scratch_flags;
size_type m_final; size_type m_final;
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@ -501,12 +516,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void initial() const { __device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
size_type* const shared_value = word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() + kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y; word_count.value * threadIdx.y;
final_reducer.init(reinterpret_cast<pointer_type>(shared_value)); final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@ -532,7 +547,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// gridDim.x // gridDim.x
cuda_single_inter_block_reduce_scan<true>( cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x, final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags); m_scratch_flags);
} }
@ -541,21 +556,22 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void final() const { __device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... } // value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); word_size_type* const shared_data =
size_type* const shared_prefix = kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y; shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum = word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1); shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total. // Starting value for this thread block is the previous block's total.
if (blockIdx.x) { if (blockIdx.x) {
size_type* const block_total = word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1); m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -602,7 +618,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
typename Analysis::pointer_type(shared_data + word_count.value)); typename Analysis::pointer_type(shared_data + word_count.value));
{ {
size_type* const block_total = word_size_type* const block_total =
shared_data + word_count.value * blockDim.y; shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -690,8 +706,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// How many blocks are really needed for this much work: // How many blocks are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block; const int grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = cuda_internal_scratch_space( m_scratch_space =
m_policy.space(), Analysis::value_size(m_functor) * grid_x); reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags = m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
@ -708,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false; m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>( CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
} }
#endif #endif
m_final = true; m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>( CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
} }
} }
@ -752,10 +769,26 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
Policy, FunctorType>; Policy, FunctorType>;
public: public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type; using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type; using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType; using functor_type = FunctorType;
using size_type = Cuda::size_type; using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
private: private:
// Algorithmic constraints: // Algorithmic constraints:
@ -766,7 +799,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
const FunctorType m_functor; const FunctorType m_functor;
const Policy m_policy; const Policy m_policy;
size_type* m_scratch_space; word_size_type* m_scratch_space;
size_type* m_scratch_flags; size_type* m_scratch_flags;
size_type m_final; size_type m_final;
ReturnType& m_returnvalue; ReturnType& m_returnvalue;
@ -791,12 +824,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void initial() const { __device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
size_type* const shared_value = word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() + kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y; word_count.value * threadIdx.y;
final_reducer.init(reinterpret_cast<pointer_type>(shared_value)); final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@ -822,7 +855,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// gridDim.x // gridDim.x
cuda_single_inter_block_reduce_scan<true>( cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x, final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags); m_scratch_flags);
} }
@ -831,21 +864,22 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void final() const { __device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... } // value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); word_size_type* const shared_data =
size_type* const shared_prefix = kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y; shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum = word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1); shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total. // Starting value for this thread block is the previous block's total.
if (blockIdx.x) { if (blockIdx.x) {
size_type* const block_total = word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1); m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -894,7 +928,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
typename Analysis::pointer_type(shared_data + word_count.value)); typename Analysis::pointer_type(shared_data + word_count.value));
{ {
size_type* const block_total = word_size_type* const block_total =
shared_data + word_count.value * blockDim.y; shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -983,8 +1017,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// How many blocks are really needed for this much work: // How many blocks are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block; const int grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = cuda_internal_scratch_space( m_scratch_space =
m_policy.space(), Analysis::value_size(m_functor) * grid_x); reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags = m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
@ -1002,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false; m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>( CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
} }
#endif #endif
m_final = true; m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>( CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor); const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@ -1022,7 +1057,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
#endif #endif
DeepCopy<HostSpace, CudaSpace, Cuda>( DeepCopy<HostSpace, CudaSpace, Cuda>(
m_policy.space(), &m_returnvalue, m_policy.space(), &m_returnvalue,
m_scratch_space + (grid_x - 1) * size / sizeof(int), size); m_scratch_space + (grid_x - 1) * size / sizeof(word_size_type),
size);
} }
} }

View File

@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total, *this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(), m_policy.space()
true); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
} }
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>( CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total, *this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(), m_policy.space()
true); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) { if (!m_result_ptr_device_accessible) {
m_policy.space().fence( m_policy.space().fence(

View File

@ -116,6 +116,7 @@ __device__ inline void cuda_inter_warp_reduction(
value = result[0]; value = result[0];
for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++) for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++)
reducer.join(&value, &result[i]); reducer.join(&value, &result[i]);
__syncthreads();
} }
template <class ValueType, class ReducerType> template <class ValueType, class ReducerType>
@ -427,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance. // function qualifier which could be used to improve performance.
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/* /*
* Algorithmic constraints: * Algorithmic constraints:
* (a) blockDim.y <= 1024 * (a) blockDim.y <= 1024

View File

@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0; const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>( Kokkos::Impl::CudaParallelLaunch<Self>(
*this, grid, block, shared, Cuda().impl_internal_space_instance(), *this, grid, block, shared, Cuda().impl_internal_space_instance());
false);
} }
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

View File

@ -448,11 +448,27 @@ class ParallelScanHIPBase {
Policy, FunctorType>; Policy, FunctorType>;
public: public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type; using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type; using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType; using functor_type = FunctorType;
using size_type = Kokkos::Experimental::HIP::size_type; using size_type = Kokkos::Experimental::HIP::size_type;
using index_type = typename Policy::index_type; using index_type = typename Policy::index_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::HIP::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
protected: protected:
// Algorithmic constraints: // Algorithmic constraints:
@ -463,7 +479,7 @@ class ParallelScanHIPBase {
const FunctorType m_functor; const FunctorType m_functor;
const Policy m_policy; const Policy m_policy;
size_type* m_scratch_space = nullptr; word_size_type* m_scratch_space = nullptr;
size_type* m_scratch_flags = nullptr; size_type* m_scratch_flags = nullptr;
size_type m_final = false; size_type m_final = false;
int m_grid_x = 0; int m_grid_x = 0;
@ -489,12 +505,12 @@ class ParallelScanHIPBase {
__device__ inline void initial() const { __device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
pointer_type const shared_value = reinterpret_cast<pointer_type>( pointer_type const shared_value = reinterpret_cast<pointer_type>(
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() + Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>() +
word_count.value * threadIdx.y); word_count.value * threadIdx.y);
final_reducer.init(shared_value); final_reducer.init(shared_value);
@ -518,7 +534,7 @@ class ParallelScanHIPBase {
// gridDim.x // gridDim.x
hip_single_inter_block_reduce_scan<true>( hip_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x, final_reducer, blockIdx.x, gridDim.x,
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(), Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>(),
m_scratch_space, m_scratch_flags); m_scratch_space, m_scratch_flags);
} }
@ -527,22 +543,22 @@ class ParallelScanHIPBase {
__device__ inline void final() const { __device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... } // value[2] , ... }
size_type* const shared_data = word_size_type* const shared_data =
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(); Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>();
size_type* const shared_prefix = word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y; shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum = word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1); shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total. // Starting value for this thread block is the previous block's total.
if (blockIdx.x) { if (blockIdx.x) {
size_type* const block_total = word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1); m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -588,7 +604,7 @@ class ParallelScanHIPBase {
typename Analysis::pointer_type(shared_data + word_count.value)); typename Analysis::pointer_type(shared_data + word_count.value));
{ {
size_type* const block_total = word_size_type* const block_total =
shared_data + word_count.value * blockDim.y; shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -647,8 +663,9 @@ class ParallelScanHIPBase {
// How many blocks are really needed for this much work: // How many blocks are really needed for this much work:
m_grid_x = (nwork + work_per_block - 1) / work_per_block; m_grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space( m_scratch_space = reinterpret_cast<word_size_type*>(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x); Kokkos::Experimental::Impl::hip_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x));
m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags( m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
m_policy.space(), sizeof(size_type) * 1); m_policy.space(), sizeof(size_type) * 1);
@ -734,7 +751,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
Kokkos::Experimental::HIP>( Kokkos::Experimental::HIP>(
Base::m_policy.space(), &m_returnvalue, Base::m_policy.space(), &m_returnvalue,
Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int), Base::m_scratch_space + (Base::m_grid_x - 1) * size /
sizeof(typename Base::word_size_type),
size); size);
} }
} }
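The switch from size_type to word_size_type lets the scan stage values whose size is not a multiple of sizeof(size_type), which is what the new small_size_scan test further down exercises with int8_t and int16_t. A minimal sketch of the idea, with an illustrative alias that is not part of Kokkos:

#include <cstdint>
#include <type_traits>

// Illustrative only: pick a staging "word" type that evenly divides the
// value size, so word_count = value_size / sizeof(word) never truncates.
template <class ValueType>
using scan_word_t =
    std::conditional_t<sizeof(ValueType) % sizeof(unsigned) == 0,
                       unsigned, unsigned char>;

static_assert(sizeof(std::int64_t) % sizeof(scan_word_t<std::int64_t>) == 0, "");
static_assert(sizeof(std::int8_t) % sizeof(scan_word_t<std::int8_t>) == 0, "");

int main() { return 0; }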

View File

@ -225,11 +225,11 @@ struct HIPReductionsFunctor<FunctorType, false> {
} }
} }
template <typename SizeType>
__device__ static inline bool scalar_inter_block_reduction( __device__ static inline bool scalar_inter_block_reduction(
FunctorType const& functor, FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_count, ::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data, SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) { ::Kokkos::Experimental::HIP::size_type* const global_flags) {
Scalar* const global_team_buffer_element = Scalar* const global_team_buffer_element =
reinterpret_cast<Scalar*>(global_data); reinterpret_cast<Scalar*>(global_data);
@ -411,16 +411,14 @@ __device__ void hip_intra_block_reduce_scan(
* Global reduce result is in the last threads' 'shared_data' location. * Global reduce result is in the last threads' 'shared_data' location.
*/ */
template <bool DoScan, class FunctorType> template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan_impl( __device__ bool hip_single_inter_block_reduce_scan_impl(
FunctorType const& functor, FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id, ::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count, ::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data, SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) { ::Kokkos::Experimental::HIP::size_type* const global_flags) {
using size_type = ::Kokkos::Experimental::HIP::size_type; using size_type = SizeType;
using value_type = typename FunctorType::value_type; using value_type = typename FunctorType::value_type;
using pointer_type = typename FunctorType::pointer_type; using pointer_type = typename FunctorType::pointer_type;
@ -518,13 +516,12 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
return is_last_block; return is_last_block;
} }
template <bool DoScan, typename FunctorType> template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan( __device__ bool hip_single_inter_block_reduce_scan(
FunctorType const& functor, FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id, ::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count, ::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data, SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) { ::Kokkos::Experimental::HIP::size_type* const global_flags) {
// If we are doing a reduction and we don't do an array reduction, we use the // If we are doing a reduction and we don't do an array reduction, we use the
// reduction-only path. Otherwise, we use the common path between reduction // reduction-only path. Otherwise, we use the common path between reduction

View File

@ -116,6 +116,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction(
value = result[0]; value = result[0];
for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i) for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
reducer.join(&value, &result[i]); reducer.join(&value, &result[i]);
__syncthreads();
} }
template <typename ValueType, typename ReducerType> template <typename ValueType, typename ReducerType>

View File

@ -3711,6 +3711,7 @@ namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same< (std::is_same<
typename Kokkos::View<T, P...>::memory_space, typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
@ -3725,9 +3726,10 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!(std::is_same< !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
typename Kokkos::View<T, P...>::memory_space, !(std::is_same<typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && typename Kokkos::View<
T, P...>::HostMirror::memory_space>::value &&
std::is_same< std::is_same<
typename Kokkos::View<T, P...>::data_type, typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value), typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
@ -3738,25 +3740,33 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
} }
// Create a mirror view in a new space (specialization for same space) // Create a mirror view in a new space (specialization for same space)
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace, class = std::enable_if_t<
typename Impl::MirrorViewType<Space, T, P...>::view_type> Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src, std::enable_if_t<Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
// Create a mirror view in a new space (specialization for different space) // Create a mirror view in a new space (specialization for different space)
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace, class = std::enable_if_t<
typename Impl::MirrorViewType<Space, T, P...>::view_type> Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src, std::enable_if_t<!Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space; return Kokkos::Impl::create_mirror(src, arg_prop);
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);
return Kokkos::Impl::create_mirror(src, prop_copy);
} }
} // namespace Impl } // namespace Impl
@ -3815,9 +3825,10 @@ typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
template <class Space, class T, class... P, template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view( typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, Space const& space, Kokkos::Impl::WithoutInitializing_t wi, Space const&,
Kokkos::View<T, P...> const& v) { Kokkos::View<T, P...> const& v) {
return Impl::create_mirror_view(space, v, view_alloc(wi)); return Impl::create_mirror_view(
v, view_alloc(typename Space::memory_space{}, wi));
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
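The refactoring above routes the space-first overloads through the ViewCtorProp-based ones, so a memory space passed via view_alloc() and a Space object passed directly end up on the same code path. A small usage sketch, assuming any enabled backend (illustrative only):

#include <Kokkos_Core.hpp>

// Both calls below should yield a host-accessible mirror; the second form is
// the one the new ViewCtorProp overloads dispatch on.
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> v("v", 100);
    auto m1 = Kokkos::create_mirror_view(Kokkos::HostSpace{}, v);
    auto m2 = Kokkos::create_mirror_view(
        Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), v);
    (void)m1;
    (void)m2;
  }
  Kokkos::finalize();
}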

View File

@ -1754,7 +1754,10 @@ struct RankDataType<ValueType, 0> {
}; };
template <unsigned N, typename... Args> template <unsigned N, typename... Args>
KOKKOS_FUNCTION std::enable_if_t<N == View<Args...>::Rank, View<Args...>> KOKKOS_FUNCTION std::enable_if_t<
N == View<Args...>::Rank &&
std::is_same<typename ViewTraits<Args...>::specialize, void>::value,
View<Args...>>
as_view_of_rank_n(View<Args...> v) { as_view_of_rank_n(View<Args...> v) {
return v; return v;
} }
@ -1762,13 +1765,13 @@ as_view_of_rank_n(View<Args...> v) {
// Placeholder implementation to compile generic code for DynRankView; should // Placeholder implementation to compile generic code for DynRankView; should
// never be called // never be called
template <unsigned N, typename T, typename... Args> template <unsigned N, typename T, typename... Args>
std::enable_if_t< KOKKOS_FUNCTION std::enable_if_t<
N != View<T, Args...>::Rank, N != View<T, Args...>::Rank &&
std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value,
View<typename RankDataType<typename View<T, Args...>::value_type, N>::type, View<typename RankDataType<typename View<T, Args...>::value_type, N>::type,
Args...>> Args...>>
as_view_of_rank_n(View<T, Args...>) { as_view_of_rank_n(View<T, Args...>) {
Kokkos::Impl::throw_runtime_exception( Kokkos::abort("Trying to get at a View of the wrong rank");
"Trying to get at a View of the wrong rank");
return {}; return {};
} }
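Replacing the thrown exception with Kokkos::abort also makes the error path usable from device code, where exceptions are unavailable; note the overload gains KOKKOS_FUNCTION in the same hunk. A minimal illustration of that constraint (not the Kokkos implementation):

#include <Kokkos_Core.hpp>

// Hypothetical helper: a KOKKOS_FUNCTION may execute on the device, so its
// only portable failure mechanism is Kokkos::abort().
KOKKOS_FUNCTION int checked_rank(int requested, int actual) {
  if (requested != actual)
    Kokkos::abort("Trying to get at a View of the wrong rank");
  return actual;
}

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  int const r = checked_rank(2, 2);
  Kokkos::finalize();
  return r == 2 ? 0 : 1;
}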

View File

@ -101,8 +101,8 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
void push_work(const std::int32_t w) const noexcept { void push_work(const std::int32_t w) const noexcept {
const std::int32_t N = m_graph.numRows(); const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const ready_queue = &m_queue[0]; std::int32_t* const ready_queue = &m_queue[0];
std::int32_t volatile* const end_hint = &m_queue[2 * N + 1]; std::int32_t* const end_hint = &m_queue[2 * N + 1];
// Push work to end of queue // Push work to end of queue
const std::int32_t j = atomic_fetch_add(end_hint, 1); const std::int32_t j = atomic_fetch_add(end_hint, 1);
@ -134,14 +134,14 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
std::int32_t pop_work() const noexcept { std::int32_t pop_work() const noexcept {
const std::int32_t N = m_graph.numRows(); const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const ready_queue = &m_queue[0]; std::int32_t* const ready_queue = &m_queue[0];
std::int32_t volatile* const begin_hint = &m_queue[2 * N]; std::int32_t* const begin_hint = &m_queue[2 * N];
// begin hint is guaranteed to be less than or equal to // begin hint is guaranteed to be less than or equal to
// actual begin location in the queue. // actual begin location in the queue.
for (std::int32_t i = *begin_hint; i < N; ++i) { for (std::int32_t i = Kokkos::atomic_load(begin_hint); i < N; ++i) {
const std::int32_t w = ready_queue[i]; const std::int32_t w = Kokkos::atomic_load(&ready_queue[i]);
if (w == END_TOKEN) { if (w == END_TOKEN) {
return END_TOKEN; return END_TOKEN;
@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
const std::int32_t N = m_graph.numRows(); const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const count_queue = &m_queue[N]; std::int32_t* const count_queue = &m_queue[N];
const std::int32_t B = m_graph.row_map(w); const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w + 1); const std::int32_t E = m_graph.row_map(w + 1);
@ -199,7 +199,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator()(const TagCount, int i) const noexcept { void operator()(const TagCount, int i) const noexcept {
std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()]; std::int32_t* const count_queue = &m_queue[m_graph.numRows()];
atomic_increment(count_queue + m_graph.entries[i]); atomic_increment(count_queue + m_graph.entries[i]);
} }
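The queue accesses above drop volatile in favour of explicit atomic loads, which is the portable way to observe updates made concurrently by other threads. A stripped-down sketch of the polling pattern, using std::atomic in place of the Kokkos queue (illustrative only):

#include <atomic>
#include <cstdint>

// Hypothetical stand-in for the ready queue: poll entries with atomic loads
// until a pushed work item (a non-negative index) becomes visible.
std::int32_t pop_first_ready(const std::atomic<std::int32_t>* queue,
                             std::int32_t n) {
  for (std::int32_t i = 0; i < n; ++i) {
    const std::int32_t w = queue[i].load(std::memory_order_relaxed);
    if (w >= 0) return w;
  }
  return -1;  // nothing ready yet; the caller retries
}

int main() {
  std::atomic<std::int32_t> queue[4];
  for (auto& q : queue) q.store(-1);
  queue[2].store(7);
  return pop_first_ready(queue, 4) == 7 ? 0 : 1;
}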

View File

@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits { struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_AMPERE) defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia; static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false; static constexpr bool may_fallback_to_host = false;
#else #else

View File

@ -47,6 +47,7 @@
#endif #endif
#include <Kokkos_Macros.hpp> #include <Kokkos_Macros.hpp>
#include <impl/Kokkos_DeviceManagement.hpp>
#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)
@ -115,7 +116,8 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards. // from Pascal and upwards.
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512); omp_set_num_teams(512);
#endif #endif
@ -164,7 +166,11 @@ void OpenMPTarget::impl_static_fence(const std::string& name) {
name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes); name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
} }
void OpenMPTarget::impl_initialize(InitializationSettings const&) { void OpenMPTarget::impl_initialize(InitializationSettings const& settings) {
using Kokkos::Impl::get_gpu;
const int device_num = get_gpu(settings);
omp_set_default_device(device_num);
Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize();
} }
void OpenMPTarget::impl_finalize() { void OpenMPTarget::impl_finalize() {
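With this change the OpenMP target backend honours the requested device id (for example one derived from the MPI local rank, see the get_gpu changes further down) by making it the default device before any offload region runs. A bare-bones sketch of that call sequence (illustrative, not the Kokkos code):

#include <omp.h>

int main() {
  const int num_devices = omp_get_num_devices();
  // A real run would pass the id chosen from the launcher environment;
  // 0 stands in for it here.
  if (num_devices > 0) omp_set_default_device(0);
  return 0;
}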

View File

@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \ #if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \ !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \ !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
!defined(KOKKOS_ARCH_AMPERE) !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) { if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device()); Impl::SYCLInternal::singleton().initialize(sycl::device());
return; return;

View File

@ -337,7 +337,8 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ #if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256, 256,
#endif #endif
max_threads_for_memory max_threads_for_memory
@ -369,7 +370,8 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ #if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256, 256,
#endif #endif
max_threads_for_memory max_threads_for_memory

View File

@ -110,10 +110,9 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept {
return ((uint64_t)a) | (((uint64_t)d) << 32); return ((uint64_t)a) | (((uint64_t)d) << 32);
#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \ #elif defined(__powerpc64__) || defined(__ppc64__)
defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__)
unsigned int cycles = 0; unsigned long cycles = 0;
asm volatile("mftb %0" : "=r"(cycles)); asm volatile("mftb %0" : "=r"(cycles));

View File

@ -166,6 +166,8 @@ int get_device_count() {
#elif defined(KOKKOS_ENABLE_OPENACC) #elif defined(KOKKOS_ENABLE_OPENACC)
return acc_get_num_devices( return acc_get_num_devices(
Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); Kokkos::Experimental::Impl::OpenACC_Traits::dev_type);
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
return omp_get_num_devices();
#else #else
Kokkos::abort("implementation bug"); Kokkos::abort("implementation bug");
return -1; return -1;
@ -426,11 +428,17 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) {
Kokkos::abort("implementation bug"); Kokkos::abort("implementation bug");
} }
auto const* local_rank_str = char const* local_rank_str = nullptr;
std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); // OpenMPI for (char const* env_var : {
if (!local_rank_str) "OMPI_COMM_WORLD_LOCAL_RANK", // OpenMPI
local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); // MVAPICH2 "MV2_COMM_WORLD_LOCAL_RANK", // MVAPICH2
if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); // SLURM "MPI_LOCALRANKID", // MPICH
"SLURM_LOCALID", // SLURM
"PMI_LOCAL_RANK" // PMI
}) {
local_rank_str = std::getenv(env_var);
if (local_rank_str) break;
}
// use first GPU available for execution if unable to detect local MPI rank // use first GPU available for execution if unable to detect local MPI rank
if (!local_rank_str) { if (!local_rank_str) {

View File

@ -1128,9 +1128,8 @@ struct ViewOffset<
KOKKOS_INLINE_FUNCTION constexpr ViewOffset( KOKKOS_INLINE_FUNCTION constexpr ViewOffset(
const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs) const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || static_assert(((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 && (DimRHS::rank == 1 && dimension_type::rank == 1)),
dimension_type::rank_dynamic == 1),
"ViewOffset LayoutLeft and LayoutRight are only compatible " "ViewOffset LayoutLeft and LayoutRight are only compatible "
"when rank <= 1"); "when rank <= 1");
} }
@ -1778,8 +1777,7 @@ struct ViewOffset<
const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs) const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 && (DimRHS::rank == 1 && dimension_type::rank == 1),
dimension_type::rank_dynamic == 1),
"ViewOffset LayoutRight and LayoutLeft are only compatible " "ViewOffset LayoutRight and LayoutLeft are only compatible "
"when rank <= 1"); "when rank <= 1");
} }
@ -3059,10 +3057,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
std::is_trivially_copy_assignable<Dummy>::value> std::is_trivially_copy_assignable<Dummy>::value>
construct_shared_allocation() { construct_shared_allocation() {
// Shortcut for zero initialization // Shortcut for zero initialization
ValueType value{};
// On A64FX memset seems to do the wrong thing with regards to first touch // On A64FX memset seems to do the wrong thing with regards to first touch
// leading to the significant performance issues // leading to the significant performance issues
#ifndef KOKKOS_ARCH_A64FX #ifndef KOKKOS_ARCH_A64FX
ValueType value{};
if (Impl::is_zero_byte(value)) { if (Impl::is_zero_byte(value)) {
uint64_t kpID = 0; uint64_t kpID = 0;
if (Kokkos::Profiling::profileLibraryLoaded()) { if (Kokkos::Profiling::profileLibraryLoaded()) {
@ -3539,9 +3537,7 @@ class ViewMapping<
typename SrcTraits::array_layout>::value || typename SrcTraits::array_layout>::value ||
std::is_same<typename DstTraits::array_layout, std::is_same<typename DstTraits::array_layout,
Kokkos::LayoutStride>::value || Kokkos::LayoutStride>::value ||
(DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1)
(DstTraits::dimension::rank == 1 &&
DstTraits::dimension::rank_dynamic == 1)
}; };
public: public:

View File

@ -73,6 +73,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
SET(COMPILE_ONLY_SOURCES SET(COMPILE_ONLY_SOURCES
TestArray.cpp TestArray.cpp
TestCreateMirror.cpp
TestDetectionIdiom.cpp TestDetectionIdiom.cpp
TestInterOp.cpp TestInterOp.cpp
TestLegionInteroperability.cpp TestLegionInteroperability.cpp
@ -86,6 +87,7 @@ ENDIF()
KOKKOS_ADD_EXECUTABLE( KOKKOS_ADD_EXECUTABLE(
TestCompileOnly TestCompileOnly
SOURCES SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES} ${COMPILE_ONLY_SOURCES}
) )

View File

@ -0,0 +1 @@
int main() {}

View File

@ -0,0 +1,126 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
template <typename TestView, typename MemorySpace>
void check_memory_space(TestView, MemorySpace) {
static_assert(
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
}
template <class View>
auto host_mirror_test_space(View) {
return std::conditional_t<
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
typename View::memory_space>::accessible,
typename View::memory_space, Kokkos::HostSpace>{};
}
template <typename View>
void test_create_mirror_properties(const View& view) {
using namespace Kokkos;
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
// clang-format off
// create_mirror
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror( view), host_mirror_test_space(view));
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror_view
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror view_alloc
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror view_alloc + execution space
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc + execution space
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc + execution space
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
// clang-format on
}
void test() {
Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view("device view",
10);
Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}

View File

@ -92,5 +92,3 @@ static_assert(std::is_same<difference_type<Woof>, int>::value,
static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value, static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
"Bark's difference_type should be ptrdiff_t!"); "Bark's difference_type should be ptrdiff_t!");
} // namespace Example } // namespace Example
int main() {}

View File

@ -45,12 +45,12 @@
#include <Kokkos_Core.hpp> #include <Kokkos_Core.hpp>
#include <cstdio> #include <cstdio>
namespace Test { namespace {
template <class Device> template <class Device, class T, T ImbalanceSz>
struct TestScan { struct TestScan {
using execution_space = Device; using execution_space = Device;
using value_type = int64_t; using value_type = T;
Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors; Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
@ -58,7 +58,10 @@ struct TestScan {
void operator()(const int iwork, value_type& update, void operator()(const int iwork, value_type& update,
const bool final_pass) const { const bool final_pass) const {
const value_type n = iwork + 1; const value_type n = iwork + 1;
const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0; const value_type imbalance =
((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz))
? ImbalanceSz
: value_type(0);
// Insert an artificial load imbalance // Insert an artificial load imbalance
@ -133,12 +136,29 @@ struct TestScan {
} }
} }
}; };
} // namespace
TEST(TEST_CATEGORY, scan) { TEST(TEST_CATEGORY, scan) {
TestScan<TEST_EXECSPACE>::test_range(1, 1000); constexpr auto imbalance_size = 1000;
TestScan<TEST_EXECSPACE>(0); TestScan<TEST_EXECSPACE, int64_t, imbalance_size>::test_range(1, 1000);
TestScan<TEST_EXECSPACE>(100000); TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(0);
TestScan<TEST_EXECSPACE>(10000000); TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(100000);
TEST_EXECSPACE().fence(); TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(10000000);
}
TEST(TEST_CATEGORY, small_size_scan) {
constexpr auto imbalance_size = 10; // Pick to not overflow...
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(0);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(5);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(10);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(
static_cast<std::size_t>(
std::sqrt(std::numeric_limits<std::int8_t>::max())));
constexpr auto short_imbalance_size = 100; // Pick to not overflow...
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(0);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(5);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(100);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(
static_cast<std::size_t>(
std::sqrt(std::numeric_limits<std::int16_t>::max())));
} }
} // namespace Test

View File

@ -1616,6 +1616,73 @@ struct TestTeamPolicyHandleByValue {
} // namespace } // namespace
namespace {
template <typename ExecutionSpace>
struct TestRepeatedTeamReduce {
static constexpr int ncol = 1500; // nothing special, just some work
KOKKOS_FUNCTION void operator()(
const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
const {
// non-divisible by power of two to make triggering problems easier
constexpr int nlev = 129;
constexpr auto pi = Kokkos::Experimental::pi_v<double>;
double b = 0.;
for (int ri = 0; ri < 10; ++ri) {
// The contributions here must be sufficiently complex, simply adding ones
// wasn't enough to trigger the bug.
const auto g1 = [&](const int k, double &acc) {
acc += Kokkos::cos(pi * double(k) / nlev);
};
const auto g2 = [&](const int k, double &acc) {
acc += Kokkos::sin(pi * double(k) / nlev);
};
double a1, a2;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
b += a1;
b += a2;
}
const auto h = [&]() {
const auto col = team.league_rank();
v(col) = b + col;
};
Kokkos::single(Kokkos::PerTeam(team), h);
}
KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
if (v(i) != v(0) + i) {
++bad;
KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i);
}
}
TestRepeatedTeamReduce() : v("v", ncol) { test(); }
void test() {
int team_size_recommended =
Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended(
*this, Kokkos::ParallelForTag());
// Choose a non-recommended (non-power-of-two for GPUs) team size
int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1;
// The failure was non-deterministic so run the test a bunch of times
for (int it = 0; it < 100; ++it) {
Kokkos::parallel_for(
Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this);
int bad = 0;
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol),
*this, bad);
ASSERT_EQ(bad, 0) << " Failing in iteration " << it;
}
}
Kokkos::View<double *, ExecutionSpace> v;
};
} // namespace
} // namespace Test } // namespace Test
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/

View File

@ -134,5 +134,15 @@ TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) {
} }
} }
TEST(TEST_CATEGORY, repeated_team_reduce) {
#ifdef KOKKOS_ENABLE_OPENMPTARGET
if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not "
"properly implemented";
#endif
TestRepeatedTeamReduce<TEST_EXECSPACE>();
}
} // namespace Test } // namespace Test
#endif #endif

View File

@ -92,8 +92,18 @@ TEST(TEST_CATEGORY, view_is_assignable) {
View<double*, left, d_exec>>::test(false, false, 10); View<double*, left, d_exec>>::test(false, false, 10);
// Layout assignment // Layout assignment
Impl::TestAssignability<View<int, left, d_exec>,
View<int, right, d_exec>>::test(true, true);
Impl::TestAssignability<View<int*, left, d_exec>, Impl::TestAssignability<View<int*, left, d_exec>,
View<int*, right, d_exec>>::test(true, true, 10); View<int*, right, d_exec>>::test(true, true, 10);
Impl::TestAssignability<View<int[5], left, d_exec>,
View<int*, right, d_exec>>::test(false, false, 10);
Impl::TestAssignability<View<int[10], left, d_exec>,
View<int*, right, d_exec>>::test(false, true, 10);
Impl::TestAssignability<View<int*, left, d_exec>,
View<int[5], right, d_exec>>::test(true, true);
Impl::TestAssignability<View<int[5], left, d_exec>,
View<int[10], right, d_exec>>::test(false, false);
// This could be made possible (due to the degenerate nature of the views) but // This could be made possible (due to the degenerate nature of the views) but
// we do not allow this yet // we do not allow this yet

lib/kokkos/kokkos_5538.diff (new file, 199 lines)
View File

@ -0,0 +1,199 @@
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 22af411f32..530510a0d1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+ endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 88ddc48378..b83ced9243 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -102,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index f56cef1651..2585a6a64c 100644
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index ef16aad047..c1d76cceeb 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()
+IF (KOKKOS_ARCH_HOPPER90)
+ SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 56f9117844..fcd4773dbc 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
case 61: return 96;
case 70:
case 80:
- case 86: return 8;
+ case 86:
+ case 90: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index 40a263561f..8c40ebd60d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index f9451ecfe6..2ce1efb98c 100644
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_AMPERE)
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index a9bc085912..27ee1d4232 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 840db4327c..7e5addbc5b 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
- !defined(KOKKOS_ARCH_AMPERE)
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 5ac7d8af30..ba101f699e 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

lib/kokkos/kokkos_5706.diff (new file, 523 lines)
View File

@ -0,0 +1,523 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index fcd4773dbc..30b6958a67 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these number can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
return 0;
}() * 1024;
}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 5811498e01..e22eb3b842 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84f..5c4c3a7d39 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}
-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
+
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
- bool prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+ const KernelFuncPtr& func) {
+ // Only call cudaFuncGetAttributes once for each unique kernel
+ // by leveraging static variable initialization rules
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+ cudaFuncAttributes attr;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+ return attr;
+ };
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
+ return func_attr;
+}
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+ const cudaDeviceProp& device_props,
+ const size_t block_size, int& shmem,
+ const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
- // On Kepler the L1 has no benefit since it doesn't cache reads
+
+ const auto& func_attr =
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+ // Compute limits for number of blocks due to registers/SM
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
+ const size_t regs_per_thread = func_attr.numRegs;
+ // The granularity of register allocation is chunks of 256 registers per warp
+ // -> 8 registers per thread
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+ const size_t max_blocks_regs =
+ regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute how many threads per sm we actually want
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+ // only allocate multiples of warp size
+ const size_t num_threads_desired =
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+ // Get close to the desired occupancy,
+ // don't undershoot by much but also don't allocate a whole new block just
+ // because one is a few threads over otherwise.
+ size_t num_blocks_desired =
+ (num_threads_desired + block_size * 0.8) / block_size;
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+ // Calculate how much shared memory we need per block
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
+ // If we want to lower occupancy we have to make sure we request at least that
+ // much in aggregate over all blocks, so that shared memory actually becomes a
+ // limiting factor for occupancy
+ constexpr size_t min_shmem_size_per_sm = 8192;
+ if ((occupancy < 100) &&
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+ // Need to set the caller's shmem variable so that the
+ // kernel launch uses the correct dynamic shared memory request
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
+ }
+
+ // Compute the carveout fraction we need based on occupancy
+ // Use multiples of 8kB
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+ size_t carveout = shmem_per_block == 0
+ ? 0
+ : 100 *
+ (((num_blocks_desired * shmem_per_block +
+ min_shmem_size_per_sm - 1) /
+ min_shmem_size_per_sm) *
+ min_shmem_size_per_sm) /
+ max_shmem_per_sm;
+ if (carveout > 100) carveout = 100;
+
+ // Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
- func,
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
- return prefer_shmem;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+ return carveout;
};
- static bool cache_config_preference_cached = set_cache_config();
- if (cache_config_preference_cached != prefer_shmem) {
+ // Store the value in a static variable so we only reset if needed
+ static size_t cache_config_preference_cached = set_cache_config();
+ if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
- (void)prefer_shmem;
+ (void)device_props;
+ (void)block_size;
+ (void)occupancy;
#endif
}
-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const& policy, cudaDeviceProp const& properties,
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
- bool& prefer_shmem) {
- int const block_size = block.x * block.y * block.z;
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
- size_t const static_shmem = attributes.sharedSizeBytes;
-
- // round to nearest integer and avoid division by zero
- int active_blocks = std::max(
- 1, static_cast<int>(std::round(
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
- block_size * desired_occupancy / 100)));
- int const dynamic_shmem =
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
- if (dynamic_shmem > shmem) {
- shmem = dynamic_shmem;
- prefer_shmem = false;
- }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
void const* args[] = {&driver};
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if constexpr (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
+ driver, grid, block, shmem, cuda_instance);
}
#endif
};
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
- const CudaInternal* cuda_instance,
- bool prefer_shmem) {
+ const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem);
- // If a desired occupancy is specified, we compute how much shared memory
- // to ask for to achieve that occupancy, assuming that the cache
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
- // shared memory computed is actually smaller than `shmem` we overwrite
- // `shmem` and set `prefer_shmem` to `false`.
- modify_launch_configuration_if_desired_occupancy_is_specified(
- driver.get_policy(), cuda_instance->m_deviceProp,
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
- Impl::configure_shmem_preference<
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<
+ DriverType,
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
}
static cudaFuncAttributes get_cuda_func_attributes() {
- // Race condition inside of cudaFuncGetAttributes if the same address is
- // given requires using a local variable as input instead of a static Rely
- // on static variable initialization to make sure only one thread executes
- // the code and the result is visible.
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
- cudaFuncAttributes attr_tmp;
- KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
- return attr_tmp;
- };
- static cudaFuncAttributes attr = wrap_get_attributes();
- return attr;
+ return get_cuda_kernel_func_attributes<
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func());
}
};
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index e586bb4cc6..0e348c092a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index ac160f8fe2..d1031751c2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
}
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdd16085b3..ea9430b812 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
m_policy.space().fence(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 34d4bef9fd..178012431c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fb3a6b138f..a12378a891 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>(
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
- false);
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
}
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

@@ -0,0 +1,46 @@

diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 30b6958a67..b94f053272 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-// "Compute Capability" first and check what is the smallest "Shared Memory
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-// bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-// the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
- int const compute_capability = properties.major * 10 + properties.minor;
- return [compute_capability]() {
- switch (compute_capability) {
- case 30:
- case 32:
- case 35: return 16;
- case 37: return 80;
- case 50:
- case 53:
- case 60:
- case 62: return 64;
- case 52:
- case 61: return 96;
- case 70:
- case 80:
- case 86:
- case 90: return 8;
- case 75: return 32;
- default:
- Kokkos::Impl::throw_runtime_exception(
- "Unknown device in cuda block size deduction");
- }
- return 0;
- }() * 1024;
-}
-
} // namespace Impl
} // namespace Kokkos

lib/kokkos/kokkos_5739.diff (new file, 204 lines)
@@ -0,0 +1,204 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index b94f053272..252c13c524 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -53,17 +53,69 @@
namespace Kokkos {
namespace Impl {
+inline int cuda_warp_per_sm_allocation_granularity(
+ cudaDeviceProp const& properties) {
+ // Allocation granularity of warps in each sm
+ switch (properties.major) {
+ case 3:
+ case 5:
+ case 7:
+ case 8:
+ case 9: return 4;
+ case 6: return (properties.minor == 0 ? 2 : 4);
+ default:
+ throw_runtime_exception(
+ "Unknown device in cuda warp per sm allocation granularity");
+ return 0;
+ }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+ // Maximum number of warps per sm as a function of register counts,
+ // subject to the constraint that warps are allocated with a fixed granularity
+ int const max_regs_per_block = properties.regsPerBlock;
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
+ int const warp_granularity =
+ cuda_warp_per_sm_allocation_granularity(properties);
+ // The granularity of register allocation is chunks of 256 registers per warp,
+ // which implies a need to over-allocate, so we round up
+ int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);
+
+ // The maximum number of warps per SM is constrained from above by register
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
+ // finite granularity, we need to round down.
+ int const max_warps_per_sm =
+ warp_granularity *
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+ return max_warps_per_sm;
+}
+
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
- regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ int const max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(properties, attributes);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / properties.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+ LaunchBounds) {
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
+ // memory
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+ LaunchBounds{});
+}
+
} // namespace Impl
} // namespace Kokkos
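The helpers added above bound the number of resident warps and blocks per SM by register usage, honoring both the 256-register-per-warp allocation chunk and the per-architecture warp allocation granularity. A small sketch of the arithmetic with assumed register counts and device limits (not taken from any specific GPU):

    #include <cstdio>

    int main() {
      int const warp_size        = 32;
      int const regs_per_thread  = 40;     // attributes.numRegs (assumed)
      int const regs_per_block   = 65536;  // properties.regsPerBlock (assumed)
      int const regs_per_sm      = 65536;  // properties.regsPerMultiprocessor (assumed)
      int const warp_granularity = 4;      // cuda_warp_per_sm_allocation_granularity (assumed)
      int const block_size       = 96;     // 3 warps per block

      // Registers are handed to warps in 256-register chunks, so round up
      int const regs_per_warp           = regs_per_thread * warp_size;          // 1280
      int const allocated_regs_per_warp = 256 * ((regs_per_warp + 255) / 256);  // 1280

      // Warps per SM are granted in groups of warp_granularity, so round down
      int const max_warps_per_sm =
          warp_granularity *
          (regs_per_block / (allocated_regs_per_warp * warp_granularity));      // 48

      // Per-thread register rounding (8-register granularity) limits blocks/SM
      int const allocated_regs_per_thread = 8 * ((regs_per_thread + 7) / 8);    // 40
      int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);  // 17

      // Cap blocks so the implied warp count never exceeds the warp limit
      while ((max_blocks_regs * block_size / warp_size) > max_warps_per_sm)
        --max_blocks_regs;                                                      // 17 -> 16

      std::printf("max warps/SM = %d, max blocks/SM = %d\n",
                  max_warps_per_sm, max_blocks_regs);
    }

Without the new warp cap, 17 blocks of 96 threads would imply 51 resident warps, more than the 48 the register file can actually hold at this allocation granularity.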
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5c4c3a7d39..170183ca0a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ const size_t max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
+
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0e348c092a..7e4f62f12e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {
+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+ cudaFuncAttributes attr =
+ CudaParallelLaunch<ParallelType,
+ LaunchBounds>::get_cuda_func_attributes();
+ auto const& prop = pol.space().cuda_device_prop();
+
+ // Limits due to registers/SM, MDRange doesn't have
+ // shared memory constraints
+ int const optimal_block_size =
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+ // Compute how many blocks of this size we can launch, based on warp
+ // constraints
+ int const max_warps_per_sm_registers =
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+ int const max_num_threads_from_warps =
+ max_warps_per_sm_registers * prop.warpSize;
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+ // Compute the total number of threads
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+ return std::min(
+ max_threads_per_sm,
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelFor,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelReduce,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {

@@ -0,0 +1,63 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 170183ca0a..ba43e362bb 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
+ /*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
void const* args[] = {&driver};
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- if constexpr (DriverType::Policy::
+ if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
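For reference, a desired occupancy enters through an occupancy hint attached to an execution policy; with the patch above applied, submitting such a policy while a CUDA graph is being built fails loudly instead of silently misconfiguring shared memory. A hedged sketch using the standard Kokkos::Experimental hint (the problem size and the 50% figure are arbitrary):

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        int const n = 1 << 20;
        Kokkos::View<double*> x("x", n);

        // Ask Kokkos to target roughly 50% occupancy instead of its default
        // block-size heuristic; inside CUDA graph capture this is the code
        // path the patch turns into a runtime error until fixed upstream.
        auto policy = Kokkos::Experimental::prefer(
            Kokkos::RangePolicy<>(0, n),
            Kokkos::Experimental::DesiredOccupancy{50});

        Kokkos::parallel_for(
            "fill", policy, KOKKOS_LAMBDA(int i) { x(i) = 2.0 * i; });
        Kokkos::fence();
      }
      Kokkos::finalize();
      return 0;
    }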

@@ -29,3 +29,4 @@ tag: 3.5.00 date: 11:19:2021 master: c28a8b03 release: 21b879e4
tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff
tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404 tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404
tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb
tag: 3.7.01 date: 12:01:2022 master: 61d7db55 release: d3bb8cfe

@@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE TEST_OPTIONAL_TPLS CUSPARSE
) )

@@ -0,0 +1,3 @@
#Official Tool: clang-format version 8.0.0
DisableFormat: true
SortIncludes: false

@@ -76,7 +76,7 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global /// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels /// variable based on the Host global variable prior to running any kernels
/// that will use it. /// that will use it.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. /// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__ __device__
#ifdef __CUDACC_RDC__ #ifdef __CUDACC_RDC__
__constant__ extern __constant__ extern
@@ -138,42 +138,33 @@ namespace {
static int lock_array_copied = 0; static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; } inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace } // namespace
#ifdef __CUDACC_RDC__
inline
#else
static
#endif
void
copy_cuda_lock_arrays_to_device() {
if (lock_array_copied == 0) {
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
&CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
sizeof(int32_t*));
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
&CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
sizeof(int32_t*));
}
lock_array_copied = 1;
}
} // namespace Impl } // namespace Impl
} // namespace desul } // namespace desul
/* It is critical that this code be a macro, so that it will
capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE
putting this in an inline function will NOT do the right thing! */
#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
if (::desul::Impl::lock_array_copied == 0) { \
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE, \
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
sizeof(int32_t*)); \
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE, \
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \
sizeof(int32_t*)); \
} \
::desul::Impl::lock_array_copied = 1; \
}
#endif /* defined( __CUDACC__ ) */ #endif /* defined( __CUDACC__ ) */
#endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */ #endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
namespace desul {
#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__)) #if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
inline void ensure_cuda_lock_arrays_on_device() {} #define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else #else
static inline void ensure_cuda_lock_arrays_on_device() { #define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
Impl::copy_cuda_lock_arrays_to_device(); DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
}
#endif #endif
} // namespace desul #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */
#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */

@@ -70,7 +70,7 @@ void init_lock_arrays_cuda() {
"init_lock_arrays_cuda: cudaMalloc host locks"); "init_lock_arrays_cuda: cudaMalloc host locks");
auto error_sync1 = cudaDeviceSynchronize(); auto error_sync1 = cudaDeviceSynchronize();
copy_cuda_lock_arrays_to_device(); DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs"); check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
auto error_sync2 = cudaDeviceSynchronize(); auto error_sync2 = cudaDeviceSynchronize();
@@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() {
CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
#ifdef __CUDACC_RDC__ #ifdef __CUDACC_RDC__
copy_cuda_lock_arrays_to_device(); DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif #endif
} }

@@ -190,6 +190,20 @@ namespace LAMMPS_NS {
Tp_BIAS,Tp_RMASS,Tp_ZERO>(i); Tp_BIAS,Tp_RMASS,Tp_ZERO>(i);
} }
KOKKOS_INLINE_FUNCTION
static void init(value_type &update) {
update.fx = 0.0;
update.fy = 0.0;
update.fz = 0.0;
}
KOKKOS_INLINE_FUNCTION
static void join(value_type &update,
const value_type &source) {
update.fx += source.fx;
update.fy += source.fy;
update.fz += source.fz;
}
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
static void init(volatile value_type &update) { static void init(volatile value_type &update) {
update.fx = 0.0; update.fx = 0.0;
@@ -233,6 +247,15 @@
energy += c.compute_energy_item(i); energy += c.compute_energy_item(i);
} }
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
static void init(value_type &update) {
update = 0.0;
}
KOKKOS_INLINE_FUNCTION
static void join(value_type &update,
const value_type &source) {
update += source;
}
KOKKOS_INLINE_FUNCTION
static void init(volatile value_type &update) { static void init(volatile value_type &update) {
update = 0.0; update = 0.0;
} }
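The non-volatile init()/join() overloads added above track the Kokkos 3.7 move away from volatile reduction interfaces; keeping both sets lets the functors compile against Kokkos versions on either side of that transition. A minimal, self-contained sketch of the same pattern (hypothetical functor, not LAMMPS code):

    #include <Kokkos_Core.hpp>
    #include <cstdio>

    // Custom reduction value with several components, as in the force sums above
    struct ForceSum {
      double fx, fy, fz;
    };

    struct ForceFunctor {
      using value_type = ForceSum;

      KOKKOS_INLINE_FUNCTION
      void operator()(const int i, value_type& update) const {
        update.fx += 1.0 * i;
        update.fy += 2.0 * i;
        update.fz += 3.0 * i;
      }

      // New non-volatile overloads, required once Kokkos drops volatile
      KOKKOS_INLINE_FUNCTION
      void init(value_type& update) const {
        update.fx = 0.0; update.fy = 0.0; update.fz = 0.0;
      }
      KOKKOS_INLINE_FUNCTION
      void join(value_type& update, const value_type& src) const {
        update.fx += src.fx; update.fy += src.fy; update.fz += src.fz;
      }

      // Legacy volatile overloads, still used by older Kokkos versions
      KOKKOS_INLINE_FUNCTION
      void init(volatile value_type& update) const {
        update.fx = 0.0; update.fy = 0.0; update.fz = 0.0;
      }
      KOKKOS_INLINE_FUNCTION
      void join(volatile value_type& update, const volatile value_type& src) const {
        update.fx += src.fx; update.fy += src.fy; update.fz += src.fz;
      }
    };

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        ForceSum f;
        Kokkos::parallel_reduce("force_sum", 1000, ForceFunctor(), f);
        std::printf("fx=%g fy=%g fz=%g\n", f.fx, f.fy, f.fz);
      }
      Kokkos::finalize();
      return 0;
    }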

@@ -77,7 +77,6 @@ GPU_AWARE_UNKNOWN
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
Kokkos::InitArguments KokkosLMP::args{-1, -1, -1, false};
int KokkosLMP::is_finalized = 0; int KokkosLMP::is_finalized = 0;
int KokkosLMP::init_ngpus = 0; int KokkosLMP::init_ngpus = 0;
@@ -110,7 +109,6 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
ngpus = 0; ngpus = 0;
int device = 0; int device = 0;
nthreads = 1; nthreads = 1;
numa = 1;
int iarg = 0; int iarg = 0;
while (iarg < narg) { while (iarg < narg) {
@@ -189,30 +187,24 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
iarg += 2; iarg += 2;
} else if (strcmp(arg[iarg],"n") == 0 ||
strcmp(arg[iarg],"numa") == 0) {
numa = utils::inumeric(FLERR, arg[iarg+1], false, lmp);
iarg += 2;
} else error->all(FLERR,"Invalid Kokkos command-line arg: {}", arg[iarg]); } else error->all(FLERR,"Invalid Kokkos command-line arg: {}", arg[iarg]);
} }
// Initialize Kokkos. However, we cannot change any // Initialize Kokkos. However, we cannot change any
// Kokkos library parameters after the first initalization // Kokkos library parameters after the first initalization
if (args.num_threads != -1) { Kokkos::InitializationSettings args;
if ((args.num_threads != nthreads) || (args.num_numa != numa) || (args.device_id != device))
if (args.has_num_threads()) {
if ((args.get_num_threads() != nthreads) || (args.get_device_id() != device))
if (me == 0) if (me == 0)
error->warning(FLERR,"Kokkos package already initalized, " error->warning(FLERR,"Kokkos package already initalized. Cannot change parameters");
"cannot reinitialize with different parameters"); nthreads = args.get_num_threads();
nthreads = args.num_threads; device = args.get_device_id();
numa = args.num_numa;
device = args.device_id;
ngpus = init_ngpus; ngpus = init_ngpus;
} else { } else {
args.num_threads = nthreads; args.set_num_threads(nthreads);
args.num_numa = numa; args.set_device_id(device);
args.device_id = device;
init_ngpus = ngpus; init_ngpus = ngpus;
} }
@@ -350,7 +342,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
void KokkosLMP::initialize(Kokkos::InitArguments args, Error *error) void KokkosLMP::initialize(Kokkos::InitializationSettings args, Error *error)
{ {
if (!Kokkos::is_initialized()) { if (!Kokkos::is_initialized()) {
if (is_finalized) if (is_finalized)
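Kokkos 3.7 replaces the aggregate Kokkos::InitArguments with Kokkos::InitializationSettings, which uses getter/setter methods and, consistent with dropping the "numa" option, no longer carries a NUMA field. A minimal sketch of initializing Kokkos through the new interface (thread and device numbers are placeholders):

    #include <Kokkos_Core.hpp>

    int main() {
      Kokkos::InitializationSettings settings;
      settings.set_num_threads(4);  // host threads for this process
      settings.set_device_id(0);    // GPU assigned to this process

      // Settings can be queried before use, mirroring the has_/get_ calls above
      if (settings.has_num_threads() && settings.get_num_threads() < 1)
        return 1;

      Kokkos::initialize(settings);
      // ... build views and launch kernels ...
      Kokkos::finalize();
      return 0;
    }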

@@ -43,7 +43,6 @@ class KokkosLMP : protected Pointers {
int forward_fix_comm_changed; int forward_fix_comm_changed;
int reverse_comm_changed; int reverse_comm_changed;
int nthreads,ngpus; int nthreads,ngpus;
int numa;
int auto_sync; int auto_sync;
int gpu_aware_flag; int gpu_aware_flag;
int neigh_thread; int neigh_thread;
@@ -53,12 +52,11 @@
double binsize; double binsize;
static int is_finalized; static int is_finalized;
static Kokkos::InitArguments args;
static int init_ngpus; static int init_ngpus;
KokkosLMP(class LAMMPS *, int, char **); KokkosLMP(class LAMMPS *, int, char **);
static void initialize(Kokkos::InitArguments, Error *); static void initialize(Kokkos::InitializationSettings, Error *);
static void finalize(); static void finalize();
void accelerator(int, char **); void accelerator(int, char **);
int neigh_count(int); int neigh_count(int);

@@ -524,6 +524,12 @@ struct PairReaxKokkosFindBondFunctor {
PairReaxFFKokkos<DeviceType> c; PairReaxFFKokkos<DeviceType> c;
PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr):c(*c_ptr) {}; PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr):c(*c_ptr) {};
KOKKOS_INLINE_FUNCTION
void join(int &dst,
const int &src) const {
dst = MAX(dst,src);
}
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void join(volatile int &dst, void join(volatile int &dst,
const volatile int &src) const { const volatile int &src) const {

@@ -52,7 +52,6 @@ class KokkosLMP {
int kokkos_exists; int kokkos_exists;
int nthreads; int nthreads;
int ngpus; int ngpus;
int numa;
KokkosLMP(class LAMMPS *, int, char **) { kokkos_exists = 0; } KokkosLMP(class LAMMPS *, int, char **) { kokkos_exists = 0; }
~KokkosLMP() {} ~KokkosLMP() {}

@@ -91,7 +91,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
nthreads = 1; nthreads = 1;
#ifdef _OPENMP #ifdef _OPENMP
if (lmp->kokkos) { if (lmp->kokkos) {
nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa; nthreads = lmp->kokkos->nthreads;
} else if (getenv("OMP_NUM_THREADS") == nullptr) { } else if (getenv("OMP_NUM_THREADS") == nullptr) {
nthreads = 1; nthreads = 1;
if (me == 0) if (me == 0)