Merge pull request #3532 from stanmoore1/kk_occupancy
Update Kokkos version in LAMMPS to 3.7.01
@@ -49,8 +49,8 @@ if(DOWNLOAD_KOKKOS)
   list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
   list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
   include(ExternalProject)
-  set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.00.tar.gz" CACHE STRING "URL for KOKKOS tarball")
-  set(KOKKOS_MD5 "84991eca9f066383abe119a5bc7a11c4" CACHE STRING "MD5 checksum of KOKKOS tarball")
+  set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
+  set(KOKKOS_MD5 "f140e02b826223b1045207d9bc10d404" CACHE STRING "MD5 checksum of KOKKOS tarball")
   mark_as_advanced(KOKKOS_URL)
   mark_as_advanced(KOKKOS_MD5)
   ExternalProject_Add(kokkos_build
@@ -74,7 +74,7 @@ if(DOWNLOAD_KOKKOS)
   add_dependencies(LAMMPS::KOKKOSCORE kokkos_build)
   add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build)
 elseif(EXTERNAL_KOKKOS)
-  find_package(Kokkos 3.7.00 REQUIRED CONFIG)
+  find_package(Kokkos 3.7.01 REQUIRED CONFIG)
   target_link_libraries(lammps PRIVATE Kokkos::kokkos)
   target_link_libraries(lmp PRIVATE Kokkos::kokkos)
 else()
@@ -105,13 +105,12 @@ Either the full word or an abbreviation can be used for the keywords.
 Note that the keywords do not use a leading minus sign. I.e. the
 keyword is "t", not "-t". Also note that each of the keywords has a
 default setting. Examples of when to use these options and what
-settings to use on different platforms is given on the :doc:`KOKKOS package <Speed_kokkos>`
-doc page.
+settings to use on different platforms is given on the :doc:`KOKKOS
+package <Speed_kokkos>` doc page.
 
 * d or device
 * g or gpus
 * t or threads
-* n or numa
 
 .. parsed-literal::
 
@@ -164,19 +163,10 @@ the number of physical cores per node, to use your available hardware
 optimally. This also sets the number of threads used by the host when
 LAMMPS is compiled with CUDA=yes.
 
-.. parsed-literal::
-
-   numa Nm
-
-This option is only relevant when using pthreads with hwloc support.
-In this case Nm defines the number of NUMA regions (typically sockets)
-on a node which will be utilized by a single MPI rank. By default Nm
-= 1. If this option is used the total number of worker-threads per
-MPI rank is threads\*numa. Currently it is always almost better to
-assign at least one MPI rank per NUMA region, and leave numa set to
-its default value of 1. This is because letting a single process span
-multiple NUMA regions induces a significant amount of cross NUMA data
-traffic which is slow.
+.. deprecated:: 22Dec2022
+
+Support for the "numa" or "n" option was removed as its functionality
+was ignored in Kokkos for some time already.
 
 ----------
 
@@ -1,5 +1,27 @@
 # Change Log
 
+## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)
+
+### Bug Fixes:
+- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547)
+- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552)
+- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521)
+- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553)
+- Fix warning calling a `__host__` function from a `__host__ __device__` from `View::as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591)
+- OpenMPTarget: adding implementation to set device id. [\#5557](https://github.com/kokkos/kokkos/pull/5557)
+- Use `Kokkos::atomic_load` to Correct Race Condition Giving Rise to Seg Faulting Error in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559)
+- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561)
+- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565)
+- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566)
+- Do not add -cuda to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569)
+- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571)
+- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582)
+- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573)
+- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592)
+- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607)
+- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622)
+
 ## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
 
@@ -102,7 +124,6 @@
 - Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
 - Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
 - Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
-- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059)
 - Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
 - Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
 - Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)
@@ -129,7 +129,7 @@ ENDIF()
 
 set(Kokkos_VERSION_MAJOR 3)
 set(Kokkos_VERSION_MINOR 7)
-set(Kokkos_VERSION_PATCH 00)
+set(Kokkos_VERSION_PATCH 01)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
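
Both build systems encode the version as a single integer, major*10000 + minor*100 + patch, so 3.7.01 becomes 30701 (the Makefile hunk further down computes the same value). A minimal downstream sanity check, sketched on the assumption that the build really picked up 3.7.01; the KOKKOS_VERSION macro is provided by the Kokkos configuration headers:

    // Hypothetical compile-time guard against a stale Kokkos install;
    // not part of this PR. KOKKOS_VERSION = major*10000 + minor*100 + patch.
    #include <Kokkos_Core.hpp>

    static_assert(KOKKOS_VERSION == 3 * 10000 + 7 * 100 + 1,
                  "expected Kokkos 3.7.01 (KOKKOS_VERSION == 30701)");

    int main() { return 0; }
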
@@ -152,6 +152,7 @@ ENDIF()
 # but scoping issues can make it difficult
 GLOBAL_SET(KOKKOS_COMPILE_OPTIONS)
 GLOBAL_SET(KOKKOS_LINK_OPTIONS)
+GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
 GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
 GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS)
 GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS)
@@ -228,6 +229,9 @@ IF (KOKKOS_HAS_TRILINOS)
   # we have to match the annoying behavior, also we have to preserve quotes
   # which needs another workaround.
   SET(KOKKOS_COMPILE_OPTIONS_TMP)
+  IF (KOKKOS_ENABLE_HIP)
+    LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS})
+  ENDIF()
   FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
     STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
     IF(OPTION_HAS_WHITESPACE EQUAL -1)
@@ -12,7 +12,7 @@ endif
 
 KOKKOS_VERSION_MAJOR = 3
 KOKKOS_VERSION_MINOR = 7
-KOKKOS_VERSION_PATCH = 00
+KOKKOS_VERSION_PATCH = 01
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
 # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Threads"
 # Options:
 # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM: BGQ,Power7,Power8,Power9
 # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
 KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
 KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
 KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
 KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
 + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
 + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
 + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
 + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
 + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
-+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
++ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
++ $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
 
 #SEK: This seems like a bug to me
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -505,10 +507,6 @@ KOKKOS_LINK_FLAGS =
 KOKKOS_SRC =
 KOKKOS_HEADERS =
 
-#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
-  KOKKOS_LIBS += -latomic
-#endif
-
 # Generating the KokkosCore_config.h file.
 
 KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@@ -550,6 +548,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  KOKKOS_LIBS += -latomic
   tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
   ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
@@ -1197,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
     KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
   endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+  endif
 
 ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
   KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
   LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
   TEST_OPTIONAL_TPLS CUSPARSE
 )
@@ -265,8 +265,8 @@ class BinSort {
   //----------------------------------------
   // Create the permutation vector, the bin_offset array and the bin_count
   // array. Can be called again if keys changed
-  template <class ExecutionSpace = exec_space>
-  void create_permute_vector(const ExecutionSpace& exec = exec_space{}) {
+  template <class ExecutionSpace>
+  void create_permute_vector(const ExecutionSpace& exec) {
     static_assert(
         Kokkos::SpaceAccessibility<ExecutionSpace,
                                    typename Space::memory_space>::accessible,
@@ -297,6 +297,15 @@ class BinSort {
         *this);
   }
 
+  // Create the permutation vector, the bin_offset array and the bin_count
+  // array. Can be called again if keys changed
+  void create_permute_vector() {
+    Kokkos::fence("Kokkos::Binsort::create_permute_vector: before");
+    exec_space e{};
+    create_permute_vector(e);
+    e.fence("Kokkos::Binsort::create_permute_vector: after");
+  }
+
   // Sort a subset of a view with respect to the first dimension using the
   // permutation array
   template <class ExecutionSpace, class ValuesViewType>
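
To see how the two entry points fit together after this change, here is a usage sketch; the key view, bin count, and value range are illustrative only, while BinOp1D and the BinSort constructor are standard Kokkos API:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_Sort.hpp>

    // Call between Kokkos::initialize() and Kokkos::finalize().
    void binsort_sketch() {
      using KeyView = Kokkos::View<int*>;
      KeyView keys("keys", 1000);  // assume filled with values in [0, 100)

      using BinOp = Kokkos::BinOp1D<KeyView>;
      Kokkos::BinSort<KeyView, BinOp> sorter(keys, BinOp(64, 0, 100));

      // Instance-free overload: as of 3.7.01 it fences before and after
      // building the permutation vector (see the added body above).
      sorter.create_permute_vector();

      // Instance-taking overload: synchronization is left to the caller.
      Kokkos::DefaultExecutionSpace exec;
      sorter.create_permute_vector(exec);
      sorter.sort(exec, keys);
      exec.fence();
    }
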
@@ -372,9 +381,10 @@ class BinSort {
   template <class ValuesViewType>
   void sort(ValuesViewType const& values, int values_range_begin,
             int values_range_end) const {
+    Kokkos::fence("Kokkos::Binsort::sort: before");
     exec_space exec;
     sort(exec, values, values_range_begin, values_range_end);
-    exec.fence("Kokkos::Sort: fence after sorting");
+    exec.fence("Kokkos::BinSort:sort: after");
   }
 
   template <class ExecutionSpace, class ValuesViewType>
@@ -641,9 +651,10 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
 
 template <class ViewType>
 void sort(ViewType const& view) {
+  Kokkos::fence("Kokkos::sort: before");
   typename ViewType::execution_space exec;
   sort(exec, view);
-  exec.fence("Kokkos::Sort: fence after sorting");
+  exec.fence("Kokkos::sort: fence after sorting");
 }
 
 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
@@ -682,6 +693,7 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
 
 template <class ViewType>
 void sort(ViewType view, size_t const begin, size_t const end) {
+  Kokkos::fence("Kokkos::sort: before");
   typename ViewType::execution_space exec;
   sort(exec, view, begin, end);
   exec.fence("Kokkos::Sort: fence after sorting");
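
The practical effect of these hunks, sketched below: the overloads that default-construct an execution space now fence before and after sorting (per changelog entry #5547), while the instance-taking overloads still leave synchronization to the caller. The view name and size are illustrative:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_Sort.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        Kokkos::View<double*> v("v", 1 << 20);
        // ... fill v, possibly asynchronously ...

        Kokkos::sort(v);  // 3.7.01: fences before sorting and after

        Kokkos::DefaultExecutionSpace exec;
        Kokkos::sort(exec, v);  // no implicit global fence: caller syncs
        exec.fence();
      }
      Kokkos::finalize();
      return 0;
    }
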
@@ -10,10 +10,12 @@
 # Default settings: change those according to your machine. For
 # example, you may have have two different wrappers with either icpc
 # or g++ as their back-end compiler. The defaults can be overwritten
-# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc).
+# by using the usual arguments (e.g., -arch=sm_80 -ccbin icpc).
+# sm_70 is supported by every CUDA version from 9-12 and is thus
+# chosen as default
 
-default_arch="sm_35"
-#default_arch="sm_50"
+default_arch="sm_70"
+#default_arch="sm_80"
 
 #
 # The default C++ compiler.
@@ -66,6 +66,7 @@
 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
 #cmakedefine KOKKOS_ARCH_ARMV81
 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
+#cmakedefine KOKKOS_ARCH_A64FX
 #cmakedefine KOKKOS_ARCH_AMD_AVX2
 #cmakedefine KOKKOS_ARCH_AVX
 #cmakedefine KOKKOS_ARCH_AVX2
@@ -101,6 +102,7 @@
 #cmakedefine KOKKOS_ARCH_AMPERE
 #cmakedefine KOKKOS_ARCH_AMPERE80
 #cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
 #cmakedefine KOKKOS_ARCH_AMD_ZEN
 #cmakedefine KOKKOS_ARCH_AMD_ZEN2
 #cmakedefine KOKKOS_ARCH_AMD_ZEN3
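
These #cmakedefine entries become ordinary preprocessor macros in the generated configuration header, so user code can branch on them. A small sketch; the program itself is illustrative, while KOKKOS_ARCH_HOPPER90 is exactly the macro added above:

    #include <Kokkos_Core.hpp>
    #include <iostream>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
    #if defined(KOKKOS_ARCH_HOPPER90)
      std::cout << "built for NVIDIA Hopper (sm_90)\n";  // new in 3.7.01
    #else
      std::cout << "not a Hopper build\n";
    #endif
      Kokkos::finalize();
      return 0;
    }
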
@@ -74,6 +74,7 @@ int main() {
     case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
     case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
     case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+    case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
     default:
       std::cout << "Compute capability " << compute_capability
                 << " is not supported" << std::endl;
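
For reference, the detection this helper relies on boils down to a CUDA runtime query; a stand-alone sketch that assumes only cudaGetDeviceProperties (the real tool additionally prints the matching -DKokkos_ARCH_* hint, now including Hopper):

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return 1;
      // Compute capability 9.0 now maps to -DKokkos_ARCH_HOPPER90=ON
      // per the case added above.
      std::printf("compute capability: %d%d\n", prop.major, prop.minor);
      return 0;
    }
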
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
 KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
 KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
 KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
 
 IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
   SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -187,7 +188,9 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
 ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   SET(CUDA_ARCH_FLAG "-gpu")
   GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda)
+  IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled
     GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
+  ENDIF()
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
   SET(CUDA_ARCH_FLAG "-arch")
 ENDIF()
@@ -542,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
 CHECK_CUDA_ARCH(TURING75 sm_75)
 CHECK_CUDA_ARCH(AMPERE80 sm_80)
 CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
 
 SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
 FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -804,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
   SET(KOKKOS_ARCH_AMPERE ON)
 ENDIF()
 
+IF (KOKKOS_ARCH_HOPPER90)
+  SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
 #Regardless of version, make sure we define the general architecture name
 IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
   SET(KOKKOS_ARCH_VEGA ON)
@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
   LIB_REQUIRED_PACKAGES KokkosCore
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
   TEST_OPTIONAL_TPLS CUSPARSE
 )
@@ -1701,7 +1701,11 @@ namespace Impl {
    underlying memory, to facilitate implementation of deep_copy() and
    other routines that are defined on View */
 template <unsigned N, typename T, typename... Args>
-KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView<T, Args...> v) {
+KOKKOS_FUNCTION auto as_view_of_rank_n(
+    DynRankView<T, Args...> v,
+    typename std::enable_if<std::is_same<
+        typename ViewTraits<T, Args...>::specialize, void>::value>::type* =
+        nullptr) {
   if (v.rank() != N) {
     KOKKOS_IF_ON_HOST(
         const std::string message =
@@ -2114,6 +2118,7 @@ inline auto create_mirror(
 namespace Impl {
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
     std::is_same<
         typename DynRankView<T, P...>::memory_space,
         typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
@@ -2128,6 +2133,7 @@ create_mirror_view(const DynRankView<T, P...>& src,
 
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
     !(std::is_same<
           typename DynRankView<T, P...>::memory_space,
           typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
@@ -2141,29 +2147,39 @@ create_mirror_view(
   return Kokkos::Impl::create_mirror(src, arg_prop);
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-inline std::enable_if_t<
-    Kokkos::is_space<Space>::value &&
-        Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
-    typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space&, const Kokkos::DynRankView<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+inline std::enable_if_t<
+    Kokkos::is_space<
+        typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
+        Impl::MirrorDRViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace,
+    typename Impl::MirrorDRViewType<
+        typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+        P...>::view_type>
+create_mirror_view(const Kokkos::DynRankView<T, P...>& src,
                    const typename Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-inline std::enable_if_t<
-    Kokkos::is_space<Space>::value &&
-        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
-    typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+inline std::enable_if_t<
+    Kokkos::is_space<
+        typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
+        !Impl::MirrorDRViewType<
+            typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+            P...>::is_same_memspace,
+    typename Impl::MirrorDRViewType<
+        typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
+        P...>::view_type>
 create_mirror_view(
-    const Space&, const Kokkos::DynRankView<T, P...>& src,
+    const Kokkos::DynRankView<T, P...>& src,
     const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
-  using MemorySpace = typename Space::memory_space;
-  using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
-  alloc_prop prop_copy(arg_prop);
-
-  return Kokkos::Impl::create_mirror(src, prop_copy);
+  return Kokkos::Impl::create_mirror(src, arg_prop);
 }
 } // namespace Impl
 
@@ -2224,9 +2240,10 @@ create_mirror_view(
 
 template <class Space, class T, class... P>
 inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi,
-                               const Space& space,
+                               const Space&,
                                const Kokkos::DynRankView<T, P...>& src) {
-  return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
+  return Impl::create_mirror_view(
+      src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
 }
 
 template <class T, class... P, class... ViewCtorArgs>
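
Net effect of the DynRankView rework, in user terms: the requested memory space now travels inside the view_alloc property list instead of a separate Space argument at the Impl level, and both public spellings stay available. A sketch with illustrative names; the view_alloc form matches the new unit test added later in this PR:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_DynRankView.hpp>

    // Call between Kokkos::initialize() and Kokkos::finalize().
    void dynrankview_mirror_sketch() {
      Kokkos::DynRankView<double, Kokkos::DefaultExecutionSpace> d("d", 10, 10);

      // Space-instance spelling:
      auto h1 = Kokkos::create_mirror_view(Kokkos::HostSpace{}, d);

      // Property spelling; the memory space rides in view_alloc, which is
      // what the reworked enable_if constraints above key on:
      auto h2 = Kokkos::create_mirror_view(
          Kokkos::view_alloc(Kokkos::HostSpace{}), d);

      Kokkos::deep_copy(h1, d);
    }
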
@@ -710,7 +710,7 @@ template <class Space, class T, class... P>
 inline auto create_mirror(
     const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
   return Impl::create_mirror(
-      src, Impl::ViewCtorProp<>{typename Space::memory_space{}});
+      src, Kokkos::view_alloc(typename Space::memory_space{}));
 }
 
 template <class Space, class T, class... P>
@@ -729,8 +729,10 @@ inline auto create_mirror(
 }
 
 namespace Impl {
+
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
     (std::is_same<
          typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
          typename Kokkos::Experimental::DynamicView<
@@ -740,14 +742,14 @@ inline std::enable_if_t<
         typename Kokkos::Experimental::DynamicView<
             T, P...>::HostMirror::data_type>::value),
     typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
-create_mirror_view(
-    const typename Kokkos::Experimental::DynamicView<T, P...>& src,
+create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
     !(std::is_same<
          typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
          typename Kokkos::Experimental::DynamicView<
@@ -762,15 +764,33 @@ create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
   return Kokkos::create_mirror(arg_prop, src);
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-inline std::enable_if_t<
-    Impl::MirrorDynamicViewType<Space, T, P...>::is_same_memspace,
-    typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space&,
-                   const Kokkos::Experimental::DynamicView<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<Impl::MirrorDynamicViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorDynamicViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<!Impl::MirrorDynamicViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorDynamicViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
+                   const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
+  return Kokkos::Impl::create_mirror(src, arg_prop);
+}
 } // namespace Impl
 
@@ -790,8 +810,9 @@ inline auto create_mirror_view(
 // Create a mirror in a new space
 template <class Space, class T, class... P>
 inline auto create_mirror_view(
-    const Space& space, const Kokkos::Experimental::DynamicView<T, P...>& src) {
-  return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
+    const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
+  return Impl::create_mirror_view(src,
+                                  view_alloc(typename Space::memory_space{}));
 }
 
 template <class Space, class T, class... P>
@@ -1901,19 +1901,22 @@ struct MirrorOffsetType {
 
 namespace Impl {
 template <class T, class... P, class... ViewCtorArgs>
-inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror
+inline std::enable_if_t<
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
+    typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
 create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
               const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
   return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror(
       Kokkos::create_mirror(arg_prop, src.view()), src.begins());
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-inline typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
-create_mirror(const Space&,
-              const Kokkos::Experimental::OffsetView<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+inline auto create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
               const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
   using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
+  using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space;
+
   static_assert(
       !alloc_prop_input::has_label,
@@ -1923,10 +1926,6 @@ create_mirror(const Space&,
       !alloc_prop_input::has_pointer,
       "The view constructor arguments passed to Kokkos::create_mirror must "
       "not include a pointer!");
-  static_assert(
-      !alloc_prop_input::has_memory_space,
-      "The view constructor arguments passed to Kokkos::create_mirror must "
-      "not include a memory space instance!");
   static_assert(
       !alloc_prop_input::allow_padding,
       "The view constructor arguments passed to Kokkos::create_mirror must "
@@ -1962,15 +1961,17 @@ inline auto create_mirror(
 template <class Space, class T, class... P,
           typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
 inline auto create_mirror(
-    const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{});
+    const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror(
+      src, Kokkos::view_alloc(typename Space::memory_space{}));
 }
 
 template <class Space, class T, class... P>
 typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
-create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
+create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&,
               const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return Impl::create_mirror(space, src, Kokkos::view_alloc(wi));
+  return Impl::create_mirror(
+      src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
 }
 
 template <class T, class... P, class... ViewCtorArgs>
@@ -1983,22 +1984,24 @@ inline auto create_mirror(
 namespace Impl {
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
     (std::is_same<
          typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
          typename Kokkos::Experimental::OffsetView<
             T, P...>::HostMirror::memory_space>::value &&
-     std::is_same<typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
-                  typename Kokkos::Experimental::OffsetView<
-                      T, P...>::HostMirror::data_type>::value),
+     std::is_same<
+         typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
+         typename Kokkos::Experimental::OffsetView<
+             T, P...>::HostMirror::data_type>::value),
     typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
-create_mirror_view(
-    const typename Kokkos::Experimental::OffsetView<T, P...>& src,
+create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
     !(std::is_same<
          typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
          typename Kokkos::Experimental::OffsetView<
@@ -2013,24 +2016,32 @@ create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
   return Kokkos::create_mirror(arg_prop, src);
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-inline std::enable_if_t<
-    Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
-    Kokkos::Experimental::OffsetView<T, P...>>
-create_mirror_view(const Space&,
-                   const Kokkos::Experimental::OffsetView<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<Impl::MirrorOffsetViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorOffsetViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }
 
-template <class Space, class T, class... P, class... ViewCtorArgs>
-std::enable_if_t<
-    !Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
-    typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space& space,
-                   const Kokkos::Experimental::OffsetView<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<!Impl::MirrorOffsetViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorOffsetViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
-  return create_mirror(space, src, arg_prop);
+  return Kokkos::Impl::create_mirror(src, arg_prop);
 }
 } // namespace Impl
 
@@ -2052,15 +2063,17 @@ inline auto create_mirror_view(
 template <class Space, class T, class... P,
           typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
 inline auto create_mirror_view(
-    const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
+    const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
+  return Impl::create_mirror_view(
+      src, Kokkos::view_alloc(typename Space::memory_space{}));
 }
 
 template <class Space, class T, class... P>
 inline auto create_mirror_view(
-    Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
+    Kokkos::Impl::WithoutInitializing_t wi, const Space&,
     const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
+  return Impl::create_mirror_view(
+      src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
 }
 
 template <class T, class... P, class... ViewCtorArgs>
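
The OffsetView overloads follow the same pattern as the DynRankView and DynamicView ones. A sketch with illustrative names; the {0, 10} index-range pair copies the style used in the new TestCreateMirror.cpp below:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_OffsetView.hpp>

    // Call between Kokkos::initialize() and Kokkos::finalize().
    void offsetview_mirror_sketch() {
      Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace> ov(
          "ov", {0, 10});

      // Both spellings request a HostSpace mirror; the second exercises the
      // view_alloc-property overloads reworked above.
      auto h1 = Kokkos::create_mirror_view(Kokkos::HostSpace{}, ov);
      auto h2 = Kokkos::create_mirror_view(
          Kokkos::view_alloc(Kokkos::HostSpace{}), ov);

      Kokkos::deep_copy(h1, ov);
    }
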
@@ -46,3 +46,13 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
     KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources})
   endif()
 endforeach()
+
+SET(COMPILE_ONLY_SOURCES
+  TestCreateMirror.cpp
+)
+KOKKOS_ADD_EXECUTABLE(
+  TestCompileOnly
+  SOURCES
+  TestCompileMain.cpp
+  ${COMPILE_ONLY_SOURCES}
+)
lib/kokkos/containers/unit_tests/TestCompileMain.cpp (new file, 1 line)
@@ -0,0 +1 @@
+int main() {}
179
lib/kokkos/containers/unit_tests/TestCreateMirror.cpp
Normal file
179
lib/kokkos/containers/unit_tests/TestCreateMirror.cpp
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
/*
|
||||||
|
//@HEADER
|
||||||
|
// ************************************************************************
|
||||||
|
//
|
||||||
|
// Kokkos v. 3.0
|
||||||
|
// Copyright (2020) National Technology & Engineering
|
||||||
|
// Solutions of Sandia, LLC (NTESS).
|
||||||
|
//
|
||||||
|
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||||
|
// the U.S. Government retains certain rights in this software.
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
// modification, are permitted provided that the following conditions are
|
||||||
|
// met:
|
||||||
|
//
|
||||||
|
// 1. Redistributions of source code must retain the above copyright
|
||||||
|
// notice, this list of conditions and the following disclaimer.
|
||||||
|
//
|
||||||
|
// 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
// notice, this list of conditions and the following disclaimer in the
|
||||||
|
// documentation and/or other materials provided with the distribution.
|
||||||
|
//
|
||||||
|
// 3. Neither the name of the Corporation nor the names of the
|
||||||
|
// contributors may be used to endorse or promote products derived from
|
||||||
|
// this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
|
||||||
|
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
|
||||||
|
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
//
|
||||||
|
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||||
|
//
|
||||||
|
// ************************************************************************
|
||||||
|
//@HEADER
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <Kokkos_Core.hpp>
|
||||||
|
#include <Kokkos_DynamicView.hpp>
|
||||||
|
#include <Kokkos_DynRankView.hpp>
|
||||||
|
#include <Kokkos_OffsetView.hpp>
|
||||||
|
|
||||||
|
template <typename TestView, typename MemorySpace>
|
||||||
|
void check_memory_space(TestView, MemorySpace) {
|
||||||
|
static_assert(
|
||||||
|
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class View>
|
||||||
|
auto host_mirror_test_space(View) {
|
||||||
|
return std::conditional_t<
|
||||||
|
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
|
||||||
|
typename View::memory_space>::accessible,
|
||||||
|
typename View::memory_space, Kokkos::HostSpace>{};
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename View>
|
||||||
|
void test_create_mirror_properties(const View& view) {
|
||||||
|
using namespace Kokkos;
|
||||||
|
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
|
||||||
|
// create_mirror
|
||||||
|
#ifndef KOKKOS_ENABLE_CXX14
|
||||||
|
// FIXME DynamicView: HostMirror is the same type
|
||||||
|
if constexpr (!is_dynamic_view<View>::value) {
|
||||||
|
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
|
    check_memory_space(create_mirror(                     view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
  check_memory_space(create_mirror(                     DefaultExecutionSpace{}, view), DeviceMemorySpace{});

  // create_mirror_view
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
    check_memory_space(create_mirror_view(                     view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
  check_memory_space(create_mirror_view(                     DefaultExecutionSpace{}, view), DeviceMemorySpace{});

  // create_mirror view_alloc
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
    check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
  check_memory_space(create_mirror(view_alloc(                     DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror_view view_alloc
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
    check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
  check_memory_space(create_mirror_view(view_alloc(                     DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
    check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
  check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror_view view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
  // FIXME DynamicView: HostMirror is the same type
  if constexpr (!is_dynamic_view<View>::value) {
    check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
    check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
  }
#endif
  check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
  check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror_view_and_copy
  check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
  check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});

  // create_mirror_view_and_copy view_alloc
  check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
  check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});

  // create_mirror_view_and_copy view_alloc + execution space
  check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
  check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});

  // clang-format on
}

void test_create_mirror_dynrankview() {
  Kokkos::DynRankView<int, Kokkos::DefaultExecutionSpace> device_view(
      "device view", 10);
  Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);

  test_create_mirror_properties(device_view);
  test_create_mirror_properties(host_view);
}

void test_create_mirror_offsetview() {
  Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace>
      device_view("device view", {0, 10});
  Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
      "host view", {0, 10});

  test_create_mirror_properties(device_view);
  test_create_mirror_properties(host_view);
}

void test_create_mirror_dynamicview() {
  Kokkos::Experimental::DynamicView<int*, Kokkos::DefaultExecutionSpace>
      device_view("device view", 2, 10);
  Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
      "host view", 2, 10);

  test_create_mirror_properties(device_view);
  test_create_mirror_properties(host_view);
}
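The check_memory_space helper these tests call is defined elsewhere in the test file and is not part of this hunk. A minimal sketch of what such a helper can look like (the comparison below is an assumption for illustration, not the test's actual definition):

    #include <type_traits>

    // Hypothetical stand-in: confirm at compile time that the mirror
    // returned by create_mirror* lives in the expected memory space.
    template <class View, class ExpectedSpace>
    void check_memory_space(View, ExpectedSpace) {
      static_assert(std::is_same<typename View::memory_space,
                                 typename ExpectedSpace::memory_space>::value,
                    "mirror allocated in an unexpected memory space");
    }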
@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib
   TEST_OPTIONAL_TPLS CUSPARSE
   )
@ -53,13 +53,69 @@
 namespace Kokkos {
 namespace Impl {

+inline int cuda_warp_per_sm_allocation_granularity(
+    cudaDeviceProp const& properties) {
+  // Allocation granularity of warps in each sm
+  switch (properties.major) {
+    case 3:
+    case 5:
+    case 7:
+    case 8:
+    case 9: return 4;
+    case 6: return (properties.minor == 0 ? 2 : 4);
+    default:
+      throw_runtime_exception(
+          "Unknown device in cuda warp per sm allocation granularity");
+      return 0;
+  }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+    cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+  // Maximum number of warps per sm as a function of register counts,
+  // subject to the constraint that warps are allocated with a fixed granularity
+  int const max_regs_per_block = properties.regsPerBlock;
+  int const regs_per_warp      = attributes.numRegs * properties.warpSize;
+  int const warp_granularity =
+      cuda_warp_per_sm_allocation_granularity(properties);
+  // The granularity of register allocation is chunks of 256 registers per warp,
+  // which implies a need to over-allocate, so we round up
+  int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);
+
+  // The maximum number of warps per SM is constrained from above by register
+  // allocation. To satisfy the constraint that warps per SM is allocated at a
+  // finite granularity, we need to round down.
+  int const max_warps_per_sm =
+      warp_granularity *
+      (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+  return max_warps_per_sm;
+}
+
 inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
                                          cudaFuncAttributes const& attributes,
                                          int block_size, size_t dynamic_shmem) {
-  // Limits due do registers/SM
+  // Limits due to registers/SM
   int const regs_per_sm     = properties.regsPerMultiprocessor;
   int const regs_per_thread = attributes.numRegs;
-  int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
+  // The granularity of register allocation is chunks of 256 registers per warp
+  // -> 8 registers per thread
+  int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+  int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+  // Compute the maximum number of warps as a function of the number of
+  // registers
+  int const max_warps_per_sm_registers =
+      cuda_max_warps_per_sm_registers(properties, attributes);
+
+  // Constrain the number of blocks to respect the maximum number of warps per
+  // SM On face value this should be an equality, but due to the warp
+  // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+  // left-hand-side of this comparison can overshoot what the hardware allows
+  // based on register counts alone
+  while ((max_blocks_regs * block_size / properties.warpSize) >
+         max_warps_per_sm_registers)
+    max_blocks_regs--;

   // Limits due to shared memory/SM
   size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
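To make the rounding in the hunk above concrete, here is a small standalone sketch with made-up device figures (64K registers per SM, warp size 32, a kernel using 40 registers per thread, warp granularity 4); the numbers are illustrative, not from any particular GPU:

    #include <cstdio>

    int main() {
      // Illustrative numbers, not queried from a real device.
      int const regs_per_block = 65536;  // registers available per SM
      int const warp_size      = 32;
      int const num_regs       = 40;     // registers per thread of some kernel
      int const granularity    = 4;      // warps are allocated 4 at a time

      // Registers are allocated in chunks of 256 per warp: round up.
      int const regs_per_warp           = num_regs * warp_size;               // 1280
      int const allocated_regs_per_warp = 256 * ((regs_per_warp + 255) / 256);  // 1280

      // Round the warp count down to the allocation granularity.
      int const max_warps_per_sm =
          granularity *
          (regs_per_block / (allocated_regs_per_warp * granularity));

      std::printf("max warps per SM: %d\n", max_warps_per_sm);  // prints 48
      return 0;
    }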
@ -203,40 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
                                 LaunchBounds{});
 }

-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-//   "Compute Capability" first and check what is the smallest "Shared Memory
-//   Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-//   bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-//   Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-//   the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
-  int const compute_capability = properties.major * 10 + properties.minor;
-  return [compute_capability]() {
-    switch (compute_capability) {
-      case 30:
-      case 32:
-      case 35: return 16;
-      case 37: return 80;
-      case 50:
-      case 53:
-      case 60:
-      case 62: return 64;
-      case 52:
-      case 61: return 96;
-      case 70:
-      case 80:
-      case 86: return 8;
-      case 75: return 32;
-      default:
-        Kokkos::Impl::throw_runtime_exception(
-            "Unknown device in cuda block size deduction");
-    }
-    return 0;
-  }() * 1024;
-}
+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+                                     LaunchBounds) {
+  auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+  // Thin version of cuda_get_opt_block_size for cases where there is no shared
+  // memory
+  auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+  return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+                                LaunchBounds{});
+}

 } // namespace Impl
 } // namespace Kokkos
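The new wrapper reuses the generic deduction routine by wiring in a callback that reports zero shared memory for every block size. A simplified, self-contained sketch of that pattern (a stand-in for illustration, not the real cuda_deduce_block_size):

    #include <algorithm>

    // Generic search: ask a callback how much dynamic shared memory a
    // given block size needs, and keep the largest admissible size.
    template <class BlockSizeToShmem>
    int deduce_block_size_sketch(int max_block_size, int shmem_per_sm,
                                 BlockSizeToShmem&& shmem_for) {
      int best = 32;
      for (int b = 32; b <= max_block_size; b += 32) {
        // A block size is admissible only if one block fits in the
        // SM's shared memory budget.
        if (shmem_for(b) <= shmem_per_sm) best = b;
      }
      return best;
    }

    // The "no shared memory" variant falls out of a trivial callback.
    int opt_no_shmem = deduce_block_size_sketch(
        1024, 48 * 1024, [](int /*block_size*/) { return 0; });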
@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
 #endif  // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010

 #if CUDA_VERSION >= 11010 && \
-    ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+    ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
 KOKKOS_INLINE_FUNCTION
 bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
 KOKKOS_INLINE_FUNCTION
@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
 }
 #endif

-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
-  cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
-  cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
   // Init the array for used for arbitrarily sized atomics
   if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
@ -93,10 +93,6 @@ namespace Impl {
 // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
 // function qualifier which could be used to improve performance.
 //----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------

 template <class DriverType>
 __global__ static void cuda_parallel_launch_constant_memory() {
@ -158,63 +154,119 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
   }
 }

-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
 // so that the static bool is unique for each type combo
 // KernelFuncPtr does not necessarily contain that type information.

 template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
-                                       bool prefer_shmem) {
-#ifndef KOKKOS_ARCH_KEPLER
-  // On Kepler the L1 has no benefit since it doesn't cache reads
-  auto set_cache_config = [&] {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
-        func,
-        (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
-    return prefer_shmem;
-  };
-  static bool cache_config_preference_cached = set_cache_config();
-  if (cache_config_preference_cached != prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+    const KernelFuncPtr& func) {
+  // Only call cudaFuncGetAttributes once for each unique kernel
+  // by leveraging static variable initialization rules
+  auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+    cudaFuncAttributes attr;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+    return attr;
+  };
+  static cudaFuncAttributes func_attr = wrap_get_attributes();
+  return func_attr;
+}
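The static local in the new helper is the whole caching mechanism: since C++11, a function-local static is initialized exactly once even under concurrent callers, so cudaFuncGetAttributes runs a single time per instantiated kernel type. The same idiom in isolation, with an illustrative expensive_query that stands in for the CUDA call:

    #include <iostream>

    struct Attributes { int regs; };

    Attributes expensive_query() {
      std::cout << "queried once\n";  // runs a single time per program
      return Attributes{42};
    }

    const Attributes& cached_attributes() {
      // Thread-safe since C++11: initialization happens exactly once;
      // concurrent callers block until it completes.
      static Attributes attr = expensive_query();
      return attr;
    }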
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+                                       const cudaDeviceProp& device_props,
+                                       const size_t block_size, int& shmem,
+                                       const size_t occupancy) {
+#ifndef KOKKOS_ARCH_KEPLER
+
+  const auto& func_attr =
+      get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+  // Compute limits for number of blocks due to registers/SM
+  const size_t regs_per_sm     = device_props.regsPerMultiprocessor;
+  const size_t regs_per_thread = func_attr.numRegs;
+  // The granularity of register allocation is chunks of 256 registers per warp
+  // -> 8 registers per thread
+  const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+  size_t max_blocks_regs =
+      regs_per_sm / (allocated_regs_per_thread * block_size);
+
+  // Compute the maximum number of warps as a function of the number of
+  // registers
+  const size_t max_warps_per_sm_registers =
+      cuda_max_warps_per_sm_registers(device_props, func_attr);
+
+  // Constrain the number of blocks to respect the maximum number of warps per
+  // SM On face value this should be an equality, but due to the warp
+  // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+  // left-hand-side of this comparison can overshoot what the hardware allows
+  // based on register counts alone
+  while ((max_blocks_regs * block_size / device_props.warpSize) >
+         max_warps_per_sm_registers)
+    max_blocks_regs--;
+
+  // Compute how many threads per sm we actually want
+  const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+  // only allocate multiples of warp size
+  const size_t num_threads_desired =
+      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+  // Get close to the desired occupancy,
+  // don't undershoot by much but also don't allocate a whole new block just
+  // because one is a few threads over otherwise.
+  size_t num_blocks_desired =
+      (num_threads_desired + block_size * 0.8) / block_size;
+  num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+  if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+  // Calculate how much shared memory we need per block
+  size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+  // The minimum shared memory allocation we can have in total per SM is 8kB.
+  // If we want to lower occupancy we have to make sure we request at least that
+  // much in aggregate over all blocks, so that shared memory actually becomes a
+  // limiting factor for occupancy
+  constexpr size_t min_shmem_size_per_sm = 8192;
+  if ((occupancy < 100) &&
+      (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+    shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+    // Need to set the caller's shmem variable so that the
+    // kernel launch uses the correct dynamic shared memory request
+    shmem = shmem_per_block - func_attr.sharedSizeBytes;
+  }
+
+  // Compute the carveout fraction we need based on occupancy
+  // Use multiples of 8kB
+  const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+  size_t carveout = shmem_per_block == 0
+                        ? 0
+                        : 100 *
+                              (((num_blocks_desired * shmem_per_block +
+                                 min_shmem_size_per_sm - 1) /
+                                min_shmem_size_per_sm) *
+                               min_shmem_size_per_sm) /
+                              max_shmem_per_sm;
+  if (carveout > 100) carveout = 100;
+
+  // Set the carveout, but only call it once per kernel or when it changes
+  auto set_cache_config = [&] {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+        func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+    return carveout;
+  };
+  // Store the value in a static variable so we only reset if needed
+  static size_t cache_config_preference_cached = set_cache_config();
+  if (cache_config_preference_cached != carveout) {
     cache_config_preference_cached = set_cache_config();
   }
 #else
   // Use the parameters so we don't get a warning
   (void)func;
-  (void)prefer_shmem;
+  (void)device_props;
+  (void)block_size;
+  (void)occupancy;
 #endif
 }

-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
-    Policy const& policy, cudaDeviceProp const& properties,
-    cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
-    bool& prefer_shmem) {
-  int const block_size        = block.x * block.y * block.z;
-  int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
-  size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
-  size_t const static_shmem           = attributes.sharedSizeBytes;
-
-  // round to nearest integer and avoid division by zero
-  int active_blocks = std::max(
-      1, static_cast<int>(std::round(
-             static_cast<double>(properties.maxThreadsPerMultiProcessor) /
-             block_size * desired_occupancy / 100)));
-  int const dynamic_shmem =
-      shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
-  if (dynamic_shmem > shmem) {
-    shmem        = dynamic_shmem;
-    prefer_shmem = false;
-  }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
-    Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
-    dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
 // </editor-fold> end Some helper functions for launch code readability }}}1
 //==============================================================================
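Walking the carveout arithmetic above with made-up inputs helps: a block size of 256 at 50 % desired occupancy on an SM with 2048 threads and 96 kB of shared memory, and a kernel using no shared memory. The figures below are purely illustrative:

    #include <algorithm>
    #include <cstdio>

    int main() {
      // Illustrative inputs, not from a real device or kernel.
      size_t const block_size = 256, occupancy = 50;
      size_t const max_threads_per_sm    = 2048;
      size_t const max_shmem_per_sm      = 96 * 1024;
      size_t const min_shmem_size_per_sm = 8192;
      size_t shmem_per_block             = 0;  // no static or dynamic shmem

      size_t const num_threads_desired =
          ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;  // 1024
      size_t num_blocks_desired =
          (num_threads_desired + block_size * 0.8) / block_size;    // 4

      // Below 100% occupancy, request enough shmem in aggregate for it
      // to actually limit residency.
      if (occupancy < 100 &&
          shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)
        shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;  // 2048

      size_t carveout =
          100 *
          (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) /
            min_shmem_size_per_sm) *
           min_shmem_size_per_sm) /
          max_shmem_per_sm;
      carveout = std::min<size_t>(carveout, 100);

      std::printf("carveout: %zu%%\n", carveout);  // prints 8
      return 0;
    }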
@ -348,7 +400,7 @@ struct CudaParallelLaunchKernelInvoker<
 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
   inline static void create_parallel_launch_graph_node(
       DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
-      CudaInternal const* cuda_instance, bool prefer_shmem) {
+      CudaInternal const* cuda_instance) {
     //----------------------------------------
     auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
     KOKKOS_EXPECTS(bool(graph));
@ -358,8 +410,19 @@ struct CudaParallelLaunchKernelInvoker<

     if (!Impl::is_empty_launch(grid, block)) {
       Impl::check_shmem_request(cuda_instance, shmem);
-      Impl::configure_shmem_preference<DriverType, LaunchBounds>(
-          base_t::get_kernel_func(), prefer_shmem);
+      if (DriverType::Policy::
+              experimental_contains_desired_occupancy) {
+        /*
+        int desired_occupancy =
+            driver.get_policy().impl_get_desired_occupancy().value();
+        size_t block_size = block.x * block.y * block.z;
+        Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+            shmem, desired_occupancy);*/
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Cuda graph node creation FAILED:"
+                        " occupancy requests are currently broken."));
+      }

       void const* args[] = {&driver};
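For context, the occupancy request this graph path now rejects comes from Kokkos's experimental occupancy-control API, which sets experimental_contains_desired_occupancy on the policy. A typical usage sketch (outside of CUDA graphs such requests are still honored):

    #include <Kokkos_Core.hpp>

    void run(int n) {
      using Kokkos::Experimental::DesiredOccupancy;
      // Ask the backend to target roughly 33% occupancy for this kernel.
      auto policy = Kokkos::Experimental::prefer(
          Kokkos::RangePolicy<>(0, n), DesiredOccupancy{33});
      Kokkos::parallel_for("occupancy_demo", policy,
                           KOKKOS_LAMBDA(int i) { (void)i; /* ... */ });
    }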
@ -442,7 +505,7 @@ struct CudaParallelLaunchKernelInvoker<
 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
   inline static void create_parallel_launch_graph_node(
       DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
-      CudaInternal const* cuda_instance, bool prefer_shmem) {
+      CudaInternal const* cuda_instance) {
     //----------------------------------------
     auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
     KOKKOS_EXPECTS(bool(graph));
@ -452,8 +515,18 @@ struct CudaParallelLaunchKernelInvoker<

     if (!Impl::is_empty_launch(grid, block)) {
       Impl::check_shmem_request(cuda_instance, shmem);
-      Impl::configure_shmem_preference<DriverType, LaunchBounds>(
-          base_t::get_kernel_func(), prefer_shmem);
+      if (DriverType::Policy::
+              experimental_contains_desired_occupancy) {
+        /*int desired_occupancy =
+            driver.get_policy().impl_get_desired_occupancy().value();
+        size_t block_size = block.x * block.y * block.z;
+        Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+            shmem, desired_occupancy);*/
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Cuda graph node creation FAILED:"
+                        " occupancy requests are currently broken."));
+      }

       auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@ -566,7 +639,7 @@ struct CudaParallelLaunchKernelInvoker<
 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
   inline static void create_parallel_launch_graph_node(
       DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
-      CudaInternal const* cuda_instance, bool prefer_shmem) {
+      CudaInternal const* cuda_instance) {
     // Just use global memory; coordinating through events to share constant
     // memory with the non-graph interface is not really reasonable since
     // events don't work with Graphs directly, and this would anyway require
@ -580,7 +653,7 @@ struct CudaParallelLaunchKernelInvoker<
         DriverType, LaunchBounds,
         Experimental::CudaLaunchMechanism::GlobalMemory>;
     global_launch_impl_t::create_parallel_launch_graph_node(
-        driver, grid, block, shmem, cuda_instance, prefer_shmem);
+        driver, grid, block, shmem, cuda_instance);
   }
 #endif
 };
@ -613,8 +686,7 @@ struct CudaParallelLaunchImpl<

   inline static void launch_kernel(const DriverType& driver, const dim3& grid,
                                    const dim3& block, int shmem,
-                                   const CudaInternal* cuda_instance,
-                                   bool prefer_shmem) {
+                                   const CudaInternal* cuda_instance) {
     if (!Impl::is_empty_launch(grid, block)) {
       // Prevent multiple threads to simultaneously set the cache configuration
       // preference and launch the same kernel
@ -623,20 +695,22 @@ struct CudaParallelLaunchImpl<

       Impl::check_shmem_request(cuda_instance, shmem);

-      // If a desired occupancy is specified, we compute how much shared memory
-      // to ask for to achieve that occupancy, assuming that the cache
-      // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
-      // shared memory computed is actually smaller than `shmem` we overwrite
-      // `shmem` and set `prefer_shmem` to `false`.
-      modify_launch_configuration_if_desired_occupancy_is_specified(
-          driver.get_policy(), cuda_instance->m_deviceProp,
-          get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
-      Impl::configure_shmem_preference<
-          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
-          base_t::get_kernel_func(), prefer_shmem);
+      if (DriverType::Policy::
+              experimental_contains_desired_occupancy) {
+        /*int desired_occupancy =
+            driver.get_policy().impl_get_desired_occupancy().value();
+        size_t block_size = block.x * block.y * block.z;
+        Impl::configure_shmem_preference<
+            DriverType,
+            Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+            shmem, desired_occupancy);*/
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Cuda graph node creation FAILED:"
+                        " occupancy requests are currently broken."));
+      }

-      ensure_cuda_lock_arrays_on_device();
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

       // Invoke the driver function on the device
       base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
@ -650,18 +724,9 @@ struct CudaParallelLaunchImpl<
   }

   static cudaFuncAttributes get_cuda_func_attributes() {
-    // Race condition inside of cudaFuncGetAttributes if the same address is
-    // given requires using a local variable as input instead of a static Rely
-    // on static variable initialization to make sure only one thread executes
-    // the code and the result is visible.
-    auto wrap_get_attributes = []() -> cudaFuncAttributes {
-      cudaFuncAttributes attr_tmp;
-      KOKKOS_IMPL_CUDA_SAFE_CALL(
-          cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
-      return attr_tmp;
-    };
-    static cudaFuncAttributes attr = wrap_get_attributes();
-    return attr;
+    return get_cuda_kernel_func_attributes<
+        DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+        base_t::get_kernel_func());
   }
 };
@ -79,7 +79,8 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
 void initialize_host_cuda_lock_arrays() {
 #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
   desul::Impl::init_lock_arrays();
-  desul::ensure_cuda_lock_arrays_on_device();
+
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
 #endif
   if (g_host_cuda_lock_arrays.atomic != nullptr) return;
   KOKKOS_IMPL_CUDA_SAFE_CALL(
@ -88,7 +89,7 @@ void initialize_host_cuda_lock_arrays() {
   Impl::cuda_device_synchronize(
       "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
   g_host_cuda_lock_arrays.n = Cuda::concurrency();
-  copy_cuda_lock_arrays_to_device();
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
   init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
                                   256>>>();
   Impl::cuda_device_synchronize(
@ -105,7 +106,7 @@ void finalize_host_cuda_lock_arrays() {
   g_host_cuda_lock_arrays.atomic = nullptr;
   g_host_cuda_lock_arrays.n      = 0;
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-  copy_cuda_lock_arrays_to_device();
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
 #endif
 }
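The init_lock_array_kernel_atomic launch above sizes its grid so there is exactly one thread per lock slot: CUDA_SPACE_ATOMIC_MASK is 0x1FFFF, so MASK + 1 = 131072 slots, launched as 512 blocks of 256 threads. A compile-time check of the ceiling division implied by that expression:

    // Sketch of the ceiling division used in the launch configuration.
    constexpr int ceil_div(int n, int d) { return (n + d - 1) / d; }
    static_assert(ceil_div(0x1FFFF + 1, 256) == 512,
                  "512 blocks of 256 threads cover one thread per lock slot");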
@ -67,7 +67,7 @@ struct CudaLockArrays {

 /// \brief This global variable in Host space is the central definition
 /// of these arrays.
-extern CudaLockArrays g_host_cuda_lock_arrays;
+extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;

 /// \brief After this call, the g_host_cuda_lock_arrays variable has
 /// valid, initialized arrays.
@ -105,12 +105,12 @@ namespace Impl {
 /// instances in other translation units, we must update this CUDA global
 /// variable based on the Host global variable prior to running any kernels
 /// that will use it.
-/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
+/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
 __device__
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
     __constant__ extern
 #endif
-    CudaLockArrays g_device_cuda_lock_arrays;
+    Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;

 #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF

@ -123,7 +123,9 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
   size_t offset = size_t(ptr);
   offset        = offset >> 2;
   offset        = offset & CUDA_SPACE_ATOMIC_MASK;
-  return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
+  return (
+      0 ==
+      atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
 }

 /// \brief Release lock for the address
@ -136,7 +138,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
   size_t offset = size_t(ptr);
   offset        = offset >> 2;
   offset        = offset & CUDA_SPACE_ATOMIC_MASK;
-  atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
+  atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
 }

 } // namespace Impl
@ -149,49 +151,45 @@ namespace {
 static int lock_array_copied = 0;
 inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 } // namespace
+} // namespace Impl
+} // namespace Kokkos

-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-inline
-#else
-static
-#endif
-    void
-    copy_cuda_lock_arrays_to_device() {
-  if (lock_array_copied == 0) {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
-                                                  &g_host_cuda_lock_arrays,
-                                                  sizeof(CudaLockArrays)));
-  }
-  lock_array_copied = 1;
-}
+/* Dan Ibanez: it is critical that this code be a macro, so that it will
+   capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
+   putting this in an inline function will NOT do the right thing! */
+#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                      \
+  {                                                                   \
+    if (::Kokkos::Impl::lock_array_copied == 0) {                     \
+      KOKKOS_IMPL_CUDA_SAFE_CALL(                                     \
+          cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
+                             &Kokkos::Impl::g_host_cuda_lock_arrays,  \
+                             sizeof(Kokkos::Impl::CudaLockArrays)));  \
+    }                                                                 \
+    lock_array_copied = 1;                                            \
+  }

 #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS

 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-inline void ensure_cuda_lock_arrays_on_device() {}
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #else
-inline static void ensure_cuda_lock_arrays_on_device() {
-  copy_cuda_lock_arrays_to_device();
-}
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
 #endif

 #else

 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-inline void ensure_cuda_lock_arrays_on_device() {}
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #else
 // Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
-inline static void ensure_cuda_lock_arrays_on_device() {
-  copy_cuda_lock_arrays_to_device();
-  desul::ensure_cuda_lock_arrays_on_device();
-}
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()         \
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #endif

 #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */

-} // namespace Impl
-} // namespace Kokkos
-
 #endif /* defined( KOKKOS_ENABLE_CUDA ) */

 #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
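The Dan Ibanez comment in the hunk above is the key design note: without relocatable device code, every translation unit that includes this header owns its own copy of the __device__ global, so the host-to-device copy must be issued by a macro that expands in the caller's translation unit. An inline function would bind to whichever TU's symbol the linker happened to keep. A stripped-down illustration of the shape, with hypothetical names:

    #include <cuda_runtime.h>

    // With RDC off, each .cu file including this header gets its own
    // distinct copy of g_device_table.
    __device__ int g_device_table[128];
    static int host_table[128];

    // Wrong (sketch): an inline function would copy into the single
    // copy owned by the TU where the compiler resolved the call.
    // inline void copy_table_to_device() {
    //   cudaMemcpyToSymbol(g_device_table, host_table, sizeof(host_table));
    // }

    // Right (sketch): a macro expands at every call site, so the symbol
    // referenced is the including TU's own copy.
    #define COPY_TABLE_TO_DEVICE() \
      cudaMemcpyToSymbol(g_device_table, host_table, sizeof(host_table))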
@ -67,6 +67,34 @@
 namespace Kokkos {
 namespace Impl {

+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+  cudaFuncAttributes attr =
+      CudaParallelLaunch<ParallelType,
+                         LaunchBounds>::get_cuda_func_attributes();
+  auto const& prop = pol.space().cuda_device_prop();
+
+  // Limits due to registers/SM, MDRange doesn't have
+  // shared memory constraints
+  int const optimal_block_size =
+      Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+  // Compute how many blocks of this size we can launch, based on warp
+  // constraints
+  int const max_warps_per_sm_registers =
+      Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+  int const max_num_threads_from_warps =
+      max_warps_per_sm_registers * prop.warpSize;
+  int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+  // Compute the total number of threads
+  int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+  return std::min(
+      max_threads_per_sm,
+      static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
 template <class FunctorType, class... Traits>
 class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
  public:
@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
  public:
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy& pol, const Functor&) {
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelFor,
-                           LaunchBounds>::get_cuda_func_attributes();
-    auto const& prop = pol.space().cuda_device_prop();
-    // Limits due to registers/SM, MDRange doesn't have
-    // shared memory constraints
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+    return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
   }
   Policy const& get_policy() const { return m_rp; }
   inline __device__ void operator()() const {
@ -121,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
                        maxblocks[1]),
               1);
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else if (RP::rank == 3) {
       const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
       KOKKOS_ASSERT(block.x > 0);
@ -139,8 +155,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
               (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
               maxblocks[2]));
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else if (RP::rank == 4) {
       // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
       // threadIdx.z
@ -158,8 +173,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
               (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
               maxblocks[2]));
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else if (RP::rank == 5) {
       // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
       // threadIdx.z
@ -175,8 +189,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
               (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
               maxblocks[2]));
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else if (RP::rank == 6) {
       // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
       // threadIdx.z
@ -191,8 +204,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
           std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
                                      maxblocks[2]));
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else {
       Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
     }
@ -263,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
  public:
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy& pol, const Functor&) {
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelReduce,
-                           LaunchBounds>::get_cuda_func_attributes();
-    auto const& prop = pol.space().cuda_device_prop();
-    // Limits due do registers/SM
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+    return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
   }
   Policy const& get_policy() const { return m_policy; }
   inline __device__ void exec_range(reference_type update) const {
@ -405,8 +407,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,

     CudaParallelLaunch<ParallelReduce, LaunchBounds>(
         *this, grid, block, shmem,
-        m_policy.space().impl_internal_space_instance(),
-        false);  // copy to device and execute
+        m_policy.space()
+            .impl_internal_space_instance());  // copy to device and execute

     if (!m_result_ptr_device_accessible) {
       if (m_result_ptr) {
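With max_tile_size_product_helper, the MDRange tile-size cap is now warp-granularity aware rather than a raw regs-per-SM division. Rough numbers for intuition (assumed, not measured): 48 warps of register budget and an optimal block size of 256 threads give 6 resident blocks, i.e. 1536 threads per SM, before clamping:

    #include <algorithm>
    #include <cstdio>

    int main() {
      // Assumed figures for illustration only.
      int const max_warps_per_sm_registers = 48;
      int const warp_size                  = 32;
      int const optimal_block_size         = 256;
      // Assumed stand-in for Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism.
      int const max_hierarchical_parallelism = 1048576;

      int const max_num_threads_from_warps =
          max_warps_per_sm_registers * warp_size;                        // 1536
      int const max_num_blocks =
          max_num_threads_from_warps / optimal_block_size;               // 6
      int const max_threads_per_sm = optimal_block_size * max_num_blocks;  // 1536

      std::printf("%d\n",
                  std::min(max_threads_per_sm, max_hierarchical_parallelism));
      return 0;
    }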
@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 #endif

     CudaParallelLaunch<ParallelFor, LaunchBounds>(
-        *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
-        false);
+        *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
   }

   ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,

     CudaParallelLaunch<ParallelReduce, LaunchBounds>(
         *this, grid, block, shmem,
-        m_policy.space().impl_internal_space_instance(),
-        false);  // copy to device and execute
+        m_policy.space()
+            .impl_internal_space_instance());  // copy to device and execute

     if (!m_result_ptr_device_accessible) {
       if (m_result_ptr) {
@ -465,8 +464,24 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||||||
public:
|
public:
|
||||||
using pointer_type = typename Analysis::pointer_type;
|
using pointer_type = typename Analysis::pointer_type;
|
||||||
using reference_type = typename Analysis::reference_type;
|
using reference_type = typename Analysis::reference_type;
|
||||||
|
using value_type = typename Analysis::value_type;
|
||||||
using functor_type = FunctorType;
|
using functor_type = FunctorType;
|
||||||
using size_type = Cuda::size_type;
|
using size_type = Cuda::size_type;
|
||||||
|
// Conditionally set word_size_type to int16_t or int8_t if value_type is
|
||||||
|
// smaller than int32_t (Kokkos::Cuda::size_type)
|
||||||
|
// word_size_type is used to determine the word count, shared memory buffer
|
||||||
|
// size, and global memory buffer size before the scan is performed.
|
||||||
|
// Within the scan, the word count is recomputed based on word_size_type
|
||||||
|
// and when calculating indexes into the shared/global memory buffers for
|
||||||
|
// performing the scan, word_size_type is used again.
|
||||||
|
// For scalars > 4 bytes in size, indexing into shared/global memory relies
|
||||||
|
// on the block and grid dimensions to ensure that we index at the correct
|
||||||
|
// offset rather than at every 4 byte word; such that, when the join is
|
||||||
|
// performed, we have the correct data that was copied over in chunks of 4
|
||||||
|
// bytes.
|
||||||
|
using word_size_type = std::conditional_t<
|
||||||
|
sizeof(value_type) < sizeof(size_type),
|
||||||
|
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Algorithmic constraints:
|
// Algorithmic constraints:
|
||||||
@ -477,7 +492,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||||||
|
|
||||||
const FunctorType m_functor;
|
const FunctorType m_functor;
|
||||||
const Policy m_policy;
|
const Policy m_policy;
|
||||||
size_type* m_scratch_space;
|
word_size_type* m_scratch_space;
|
||||||
size_type* m_scratch_flags;
|
size_type* m_scratch_flags;
|
||||||
size_type m_final;
|
size_type m_final;
|
||||||
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
|
||||||
@ -501,12 +516,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||||||
__device__ inline void initial() const {
|
__device__ inline void initial() const {
|
||||||
typename Analysis::Reducer final_reducer(&m_functor);
|
typename Analysis::Reducer final_reducer(&m_functor);
|
||||||
|
|
||||||
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
|
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
|
||||||
sizeof(size_type)>
|
sizeof(word_size_type)>
|
||||||
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
|
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
|
||||||
|
|
||||||
size_type* const shared_value =
|
word_size_type* const shared_value =
|
||||||
kokkos_impl_cuda_shared_memory<size_type>() +
|
kokkos_impl_cuda_shared_memory<word_size_type>() +
|
||||||
word_count.value * threadIdx.y;
|
word_count.value * threadIdx.y;
|
||||||
|
|
||||||
final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
|
final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
|
||||||
@ -532,7 +547,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||||||
// gridDim.x
|
// gridDim.x
|
||||||
cuda_single_inter_block_reduce_scan<true>(
|
cuda_single_inter_block_reduce_scan<true>(
|
||||||
final_reducer, blockIdx.x, gridDim.x,
|
final_reducer, blockIdx.x, gridDim.x,
|
||||||
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
|
kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
|
||||||
m_scratch_flags);
|
m_scratch_flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -541,21 +556,22 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||||||
__device__ inline void final() const {
|
__device__ inline void final() const {
|
||||||
typename Analysis::Reducer final_reducer(&m_functor);
|
typename Analysis::Reducer final_reducer(&m_functor);
|
||||||
|
|
||||||
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
|
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
|
||||||
sizeof(size_type)>
|
sizeof(word_size_type)>
|
||||||
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
|
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
|
||||||
|
|
||||||
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
|
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
|
||||||
// value[2] , ... }
|
// value[2] , ... }
|
||||||
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
|
word_size_type* const shared_data =
|
||||||
size_type* const shared_prefix =
|
kokkos_impl_cuda_shared_memory<word_size_type>();
|
||||||
|
word_size_type* const shared_prefix =
|
||||||
shared_data + word_count.value * threadIdx.y;
|
shared_data + word_count.value * threadIdx.y;
|
||||||
size_type* const shared_accum =
|
word_size_type* const shared_accum =
|
||||||
shared_data + word_count.value * (blockDim.y + 1);
|
shared_data + word_count.value * (blockDim.y + 1);
|
||||||
|
|
||||||
// Starting value for this thread block is the previous block's total.
|
// Starting value for this thread block is the previous block's total.
|
||||||
if (blockIdx.x) {
|
if (blockIdx.x) {
|
||||||
size_type* const block_total =
|
word_size_type* const block_total =
|
||||||
m_scratch_space + word_count.value * (blockIdx.x - 1);
|
m_scratch_space + word_count.value * (blockIdx.x - 1);
|
||||||
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
|
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
|
||||||
shared_accum[i] = block_total[i];
|
shared_accum[i] = block_total[i];
|
||||||
@ -602,7 +618,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||||||
typename Analysis::pointer_type(shared_data + word_count.value));
|
typename Analysis::pointer_type(shared_data + word_count.value));
|
||||||
|
|
||||||
{
|
{
|
||||||
size_type* const block_total =
|
word_size_type* const block_total =
|
||||||
shared_data + word_count.value * blockDim.y;
|
shared_data + word_count.value * blockDim.y;
|
||||||
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
|
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
|
||||||
shared_accum[i] = block_total[i];
|
shared_accum[i] = block_total[i];
|
||||||
@ -690,8 +706,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
|
|||||||
// How many block are really needed for this much work:
|
// How many block are really needed for this much work:
|
||||||
const int grid_x = (nwork + work_per_block - 1) / work_per_block;
|
const int grid_x = (nwork + work_per_block - 1) / work_per_block;
|
||||||
|
|
||||||
m_scratch_space = cuda_internal_scratch_space(
|
m_scratch_space =
|
||||||
m_policy.space(), Analysis::value_size(m_functor) * grid_x);
|
reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
|
||||||
|
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
|
||||||
m_scratch_flags =
|
m_scratch_flags =
|
||||||
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
|
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
|
||||||
|
|
||||||
@@ -708,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
       m_final = false;
       CudaParallelLaunch<ParallelScan, LaunchBounds>(
           *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       }
 #endif
       m_final = true;
       CudaParallelLaunch<ParallelScan, LaunchBounds>(
           *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute
     }
   }

@@ -752,10 +769,26 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                             Policy, FunctorType>;

 public:
+  using value_type     = typename Analysis::value_type;
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
   using functor_type   = FunctorType;
   using size_type      = Cuda::size_type;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the scan is performed.
+  // Within the scan, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the scan, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < sizeof(size_type),
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

 private:
   // Algorithmic constraints:
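The word_size_type selection above can be checked in isolation. A minimal sketch, under the assumption that size_type is a 32-bit integer (as Kokkos::Cuda::size_type is); the alias names are lifted from the hunk, everything else is illustrative:

#include <cstdint>
#include <type_traits>

using size_type = std::uint32_t;  // stand-in for Kokkos::Cuda::size_type

template <class value_type>
using word_size_type = std::conditional_t<
    sizeof(value_type) < sizeof(size_type),
    std::conditional_t<sizeof(value_type) == 2, std::int16_t, std::int8_t>,
    size_type>;

// 1- and 2-byte scalars now get a matching small word; 4-byte and larger
// scalars keep the 4-byte word and are copied in 4-byte chunks.
static_assert(std::is_same<word_size_type<std::int8_t>, std::int8_t>::value, "");
static_assert(std::is_same<word_size_type<std::int16_t>, std::int16_t>::value, "");
static_assert(std::is_same<word_size_type<float>, size_type>::value, "");
static_assert(std::is_same<word_size_type<double>, size_type>::value, "");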
@@ -766,7 +799,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,

   const FunctorType m_functor;
   const Policy m_policy;
-  size_type* m_scratch_space;
+  word_size_type* m_scratch_space;
   size_type* m_scratch_flags;
   size_type m_final;
   ReturnType& m_returnvalue;

@@ -791,12 +824,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   __device__ inline void initial() const {
     typename Analysis::Reducer final_reducer(&m_functor);

-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

-    size_type* const shared_value =
-        kokkos_impl_cuda_shared_memory<size_type>() +
+    word_size_type* const shared_value =
+        kokkos_impl_cuda_shared_memory<word_size_type>() +
         word_count.value * threadIdx.y;

     final_reducer.init(reinterpret_cast<pointer_type>(shared_value));

@@ -822,7 +855,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       // gridDim.x
       cuda_single_inter_block_reduce_scan<true>(
           final_reducer, blockIdx.x, gridDim.x,
-          kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
+          kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
           m_scratch_flags);
     }

@@ -831,21 +864,22 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   __device__ inline void final() const {
     typename Analysis::Reducer final_reducer(&m_functor);

-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
-    size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
-    size_type* const shared_prefix =
+    word_size_type* const shared_data =
+        kokkos_impl_cuda_shared_memory<word_size_type>();
+    word_size_type* const shared_prefix =
         shared_data + word_count.value * threadIdx.y;
-    size_type* const shared_accum =
+    word_size_type* const shared_accum =
         shared_data + word_count.value * (blockDim.y + 1);

     // Starting value for this thread block is the previous block's total.
     if (blockIdx.x) {
-      size_type* const block_total =
+      word_size_type* const block_total =
           m_scratch_space + word_count.value * (blockIdx.x - 1);
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_accum[i] = block_total[i];

@@ -894,7 +928,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
         typename Analysis::pointer_type(shared_data + word_count.value));

     {
-      size_type* const block_total =
+      word_size_type* const block_total =
           shared_data + word_count.value * blockDim.y;
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_accum[i] = block_total[i];

@@ -983,8 +1017,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     // How many block are really needed for this much work:
     const int grid_x = (nwork + work_per_block - 1) / work_per_block;

-    m_scratch_space = cuda_internal_scratch_space(
-        m_policy.space(), Analysis::value_size(m_functor) * grid_x);
+    m_scratch_space =
+        reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
+            m_policy.space(), Analysis::value_size(m_functor) * grid_x));
     m_scratch_flags =
         cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);

@@ -1002,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       m_final = false;
       CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
           *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       }
 #endif
       m_final = true;
       CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
           *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute

       const int size = Analysis::value_size(m_functor);
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION

@@ -1022,7 +1057,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 #endif
       DeepCopy<HostSpace, CudaSpace, Cuda>(
           m_policy.space(), &m_returnvalue,
-          m_scratch_space + (grid_x - 1) * size / sizeof(int), size);
+          m_scratch_space + (grid_x - 1) * size / sizeof(word_size_type),
+          size);
     }
   }

@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,

     CudaParallelLaunch<ParallelFor, LaunchBounds>(
         *this, grid, block, shmem_size_total,
-        m_policy.space().impl_internal_space_instance(),
-        true);  // copy to device and execute
+        m_policy.space()
+            .impl_internal_space_instance());  // copy to device and execute
   }

   ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,

     CudaParallelLaunch<ParallelReduce, LaunchBounds>(
         *this, grid, block, shmem_size_total,
-        m_policy.space().impl_internal_space_instance(),
-        true);  // copy to device and execute
+        m_policy.space()
+            .impl_internal_space_instance());  // copy to device and execute

     if (!m_result_ptr_device_accessible) {
       m_policy.space().fence(

@@ -116,6 +116,7 @@ __device__ inline void cuda_inter_warp_reduction(
     value = result[0];
     for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++)
       reducer.join(&value, &result[i]);
+    __syncthreads();
   }

 template <class ValueType, class ReducerType>
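The __syncthreads() added above closes a read-after-write hazard: every thread reads the per-warp partials out of the shared result buffer, so the buffer must not be reused until all readers are done. A minimal sketch of the pattern in plain CUDA (illustrative names, not the Kokkos code):

__device__ float block_reduce(float v, float* shared /* blockDim.x floats */) {
  shared[threadIdx.x] = v;
  __syncthreads();  // all writes visible before anyone reads
  float sum = 0.f;
  for (unsigned i = 0; i < blockDim.x; ++i) sum += shared[i];
  __syncthreads();  // all reads done before the buffer is reused
  return sum;
}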
@@ -427,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
 //  __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
 //  function qualifier which could be used to improve performance.
 //----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
 /*
  *  Algorithmic constraints:
  *   (a) blockDim.y <= 1024

@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
     const int shared = 0;

     Kokkos::Impl::CudaParallelLaunch<Self>(
-        *this, grid, block, shared, Cuda().impl_internal_space_instance(),
-        false);
+        *this, grid, block, shared, Cuda().impl_internal_space_instance());
   }

   inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

@@ -448,11 +448,27 @@ class ParallelScanHIPBase {
                             Policy, FunctorType>;

 public:
+  using value_type     = typename Analysis::value_type;
   using pointer_type   = typename Analysis::pointer_type;
   using reference_type = typename Analysis::reference_type;
   using functor_type   = FunctorType;
   using size_type      = Kokkos::Experimental::HIP::size_type;
   using index_type     = typename Policy::index_type;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::HIP::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the scan is performed.
+  // Within the scan, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the scan, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4 byte word; such that, when the join is
+  // performed, we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = std::conditional_t<
+      sizeof(value_type) < sizeof(size_type),
+      std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;

 protected:
   // Algorithmic constraints:

@@ -463,7 +479,7 @@ class ParallelScanHIPBase {

   const FunctorType m_functor;
   const Policy m_policy;
-  size_type* m_scratch_space = nullptr;
+  word_size_type* m_scratch_space = nullptr;
   size_type* m_scratch_flags = nullptr;
   size_type m_final          = false;
   int m_grid_x               = 0;

@@ -489,12 +505,12 @@ class ParallelScanHIPBase {
   __device__ inline void initial() const {
     typename Analysis::Reducer final_reducer(&m_functor);

-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

     pointer_type const shared_value = reinterpret_cast<pointer_type>(
-        Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
+        Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>() +
         word_count.value * threadIdx.y);

     final_reducer.init(shared_value);

@@ -518,7 +534,7 @@ class ParallelScanHIPBase {
       // gridDim.x
       hip_single_inter_block_reduce_scan<true>(
           final_reducer, blockIdx.x, gridDim.x,
-          Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(),
+          Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>(),
           m_scratch_space, m_scratch_flags);
     }

@@ -527,22 +543,22 @@ class ParallelScanHIPBase {
   __device__ inline void final() const {
     typename Analysis::Reducer final_reducer(&m_functor);

-    const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
-                                                   sizeof(size_type)>
-        word_count(Analysis::value_size(m_functor) / sizeof(size_type));
+    const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
+                                                        sizeof(word_size_type)>
+        word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));

     // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
     // value[2] , ... }
-    size_type* const shared_data =
-        Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>();
-    size_type* const shared_prefix =
+    word_size_type* const shared_data =
+        Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>();
+    word_size_type* const shared_prefix =
         shared_data + word_count.value * threadIdx.y;
-    size_type* const shared_accum =
+    word_size_type* const shared_accum =
         shared_data + word_count.value * (blockDim.y + 1);

     // Starting value for this thread block is the previous block's total.
     if (blockIdx.x) {
-      size_type* const block_total =
+      word_size_type* const block_total =
           m_scratch_space + word_count.value * (blockIdx.x - 1);
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_accum[i] = block_total[i];

@@ -588,7 +604,7 @@ class ParallelScanHIPBase {
         typename Analysis::pointer_type(shared_data + word_count.value));

     {
-      size_type* const block_total =
+      word_size_type* const block_total =
           shared_data + word_count.value * blockDim.y;
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_accum[i] = block_total[i];

@@ -647,8 +663,9 @@ class ParallelScanHIPBase {
     // How many block are really needed for this much work:
     m_grid_x = (nwork + work_per_block - 1) / work_per_block;

-    m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
-        m_policy.space(), Analysis::value_size(m_functor) * m_grid_x);
+    m_scratch_space = reinterpret_cast<word_size_type*>(
+        Kokkos::Experimental::Impl::hip_internal_scratch_space(
+            m_policy.space(), Analysis::value_size(m_functor) * m_grid_x));
     m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
         m_policy.space(), sizeof(size_type) * 1);

@@ -734,7 +751,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
                Kokkos::Experimental::HIP>(
           Base::m_policy.space(), &m_returnvalue,
-          Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int),
+          Base::m_scratch_space + (Base::m_grid_x - 1) * size /
+                                      sizeof(typename Base::word_size_type),
           size);
     }
   }
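Both the CUDA and the HIP backend finish a scan-with-total by deep-copying the last block's partial out of the (now word_size_type-typed) scratch buffer into the host-side return value. From user code all of this machinery sits behind the return-value overload of parallel_scan; a minimal sketch (label and sizes are illustrative):

#include <Kokkos_Core.hpp>

int main() {
  Kokkos::initialize();
  {
    long total = 0;  // receives the grand total of the scan
    Kokkos::parallel_scan(
        "prefix_sum", 1000,
        KOKKOS_LAMBDA(const int i, long& update, const bool final_pass) {
          update += i;
          (void)final_pass;  // a real functor would write prefixes here
        },
        total);
    // total == 0 + 1 + ... + 999 == 499500
  }
  Kokkos::finalize();
}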
@@ -225,11 +225,11 @@ struct HIPReductionsFunctor<FunctorType, false> {
     }
   }

+  template <typename SizeType>
   __device__ static inline bool scalar_inter_block_reduction(
       FunctorType const& functor,
       ::Kokkos::Experimental::HIP::size_type const block_count,
-      ::Kokkos::Experimental::HIP::size_type* const shared_data,
-      ::Kokkos::Experimental::HIP::size_type* const global_data,
+      SizeType* const shared_data, SizeType* const global_data,
       ::Kokkos::Experimental::HIP::size_type* const global_flags) {
     Scalar* const global_team_buffer_element =
         reinterpret_cast<Scalar*>(global_data);

@@ -411,16 +411,14 @@ __device__ void hip_intra_block_reduce_scan(
  *  Global reduce result is in the last threads' 'shared_data' location.
  */

-template <bool DoScan, class FunctorType>
+template <bool DoScan, typename FunctorType, typename SizeType>
 __device__ bool hip_single_inter_block_reduce_scan_impl(
     FunctorType const& functor,
     ::Kokkos::Experimental::HIP::size_type const block_id,
     ::Kokkos::Experimental::HIP::size_type const block_count,
-    ::Kokkos::Experimental::HIP::size_type* const shared_data,
-    ::Kokkos::Experimental::HIP::size_type* const global_data,
+    SizeType* const shared_data, SizeType* const global_data,
     ::Kokkos::Experimental::HIP::size_type* const global_flags) {
-  using size_type = ::Kokkos::Experimental::HIP::size_type;
+  using size_type = SizeType;

   using value_type   = typename FunctorType::value_type;
   using pointer_type = typename FunctorType::pointer_type;

@@ -518,13 +516,12 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
   return is_last_block;
 }

-template <bool DoScan, typename FunctorType>
+template <bool DoScan, typename FunctorType, typename SizeType>
 __device__ bool hip_single_inter_block_reduce_scan(
     FunctorType const& functor,
     ::Kokkos::Experimental::HIP::size_type const block_id,
     ::Kokkos::Experimental::HIP::size_type const block_count,
-    ::Kokkos::Experimental::HIP::size_type* const shared_data,
-    ::Kokkos::Experimental::HIP::size_type* const global_data,
+    SizeType* const shared_data, SizeType* const global_data,
     ::Kokkos::Experimental::HIP::size_type* const global_flags) {
   // If we are doing a reduction and we don't do an array reduction, we use the
   // reduction-only path. Otherwise, we use the common path between reduction

@@ -116,6 +116,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction(
     value = result[0];
     for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
       reducer.join(&value, &result[i]);
+    __syncthreads();
   }

 template <typename ValueType, typename ReducerType>

@@ -3711,6 +3711,7 @@ namespace Impl {

 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
     (std::is_same<
          typename Kokkos::View<T, P...>::memory_space,
          typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&

@@ -3725,9 +3726,10 @@ create_mirror_view(const Kokkos::View<T, P...>& src,

 template <class T, class... P, class... ViewCtorArgs>
 inline std::enable_if_t<
-    !(std::is_same<
-         typename Kokkos::View<T, P...>::memory_space,
-         typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
+    !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
+    !(std::is_same<typename Kokkos::View<T, P...>::memory_space,
+                   typename Kokkos::View<
+                       T, P...>::HostMirror::memory_space>::value &&
      std::is_same<
          typename Kokkos::View<T, P...>::data_type,
          typename Kokkos::View<T, P...>::HostMirror::data_type>::value),

@@ -3738,25 +3740,33 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
 }

 // Create a mirror view in a new space (specialization for same space)
-template <class Space, class T, class... P, class... ViewCtorArgs>
-std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
-                 typename Impl::MirrorViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<Impl::MirrorViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::View<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>&) {
   return src;
 }

 // Create a mirror view in a new space (specialization for different space)
-template <class Space, class T, class... P, class... ViewCtorArgs>
-std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
-                 typename Impl::MirrorViewType<Space, T, P...>::view_type>
-create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
+template <class T, class... P, class... ViewCtorArgs,
+          class = std::enable_if_t<
+              Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
+std::enable_if_t<!Impl::MirrorViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::is_same_memspace,
+                 typename Impl::MirrorViewType<
+                     typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
+                     T, P...>::view_type>
+create_mirror_view(const Kokkos::View<T, P...>& src,
                    const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
-  using MemorySpace = typename Space::memory_space;
-  using alloc_prop  = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
-  alloc_prop prop_copy(arg_prop);
-
-  return Kokkos::Impl::create_mirror(src, prop_copy);
+  return Kokkos::Impl::create_mirror(src, arg_prop);
 }
 }  // namespace Impl

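With these overloads the mirror's destination is carried inside the view_alloc properties rather than as a separate space argument; the has_memory_space trait decides at compile time which specialization applies. The new TestCreateMirror.cpp later in this diff exercises exactly this. A minimal usage sketch:

#include <Kokkos_Core.hpp>

void example() {
  using DeviceMemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
  Kokkos::View<int*, Kokkos::DefaultExecutionSpace> v("v", 10);

  // No memory space among the properties: the host-mirror overloads apply.
  auto h = Kokkos::create_mirror_view(
      Kokkos::view_alloc(Kokkos::WithoutInitializing), v);

  // A memory space among the properties: the ViewCtorProp-based overloads
  // pick the same-space or different-space specialization.
  auto d =
      Kokkos::create_mirror_view(Kokkos::view_alloc(DeviceMemorySpace{}), v);
}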
@@ -3815,9 +3825,10 @@ typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
 template <class Space, class T, class... P,
           typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
 typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
-    Kokkos::Impl::WithoutInitializing_t wi, Space const& space,
+    Kokkos::Impl::WithoutInitializing_t wi, Space const&,
     Kokkos::View<T, P...> const& v) {
-  return Impl::create_mirror_view(space, v, view_alloc(wi));
+  return Impl::create_mirror_view(
+      v, view_alloc(typename Space::memory_space{}, wi));
 }

 template <class T, class... P, class... ViewCtorArgs>

@@ -1754,7 +1754,10 @@ struct RankDataType<ValueType, 0> {
 };

 template <unsigned N, typename... Args>
-KOKKOS_FUNCTION std::enable_if_t<N == View<Args...>::Rank, View<Args...>>
+KOKKOS_FUNCTION std::enable_if_t<
+    N == View<Args...>::Rank &&
+        std::is_same<typename ViewTraits<Args...>::specialize, void>::value,
+    View<Args...>>
 as_view_of_rank_n(View<Args...> v) {
   return v;
 }

@@ -1762,13 +1765,13 @@ as_view_of_rank_n(View<Args...> v) {
 // Placeholder implementation to compile generic code for DynRankView; should
 // never be called
 template <unsigned N, typename T, typename... Args>
-std::enable_if_t<
-    N != View<T, Args...>::Rank,
+KOKKOS_FUNCTION std::enable_if_t<
+    N != View<T, Args...>::Rank &&
+        std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value,
     View<typename RankDataType<typename View<T, Args...>::value_type, N>::type,
          Args...>>
 as_view_of_rank_n(View<T, Args...>) {
-  Kokkos::Impl::throw_runtime_exception(
-      "Trying to get at a View of the wrong rank");
+  Kokkos::abort("Trying to get at a View of the wrong rank");
   return {};
 }

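Marking the mismatch overload KOKKOS_FUNCTION means it must compile for device, and Kokkos::abort is the device-safe replacement for the host-only throw_runtime_exception. A sketch of the same pattern in user code (the function and its purpose are hypothetical):

#include <Kokkos_Core.hpp>

// Device-callable guard: Kokkos::abort works in both host and device code,
// whereas throwing a C++ exception is host-only.
KOKKOS_FUNCTION int checked_index(int i, int n) {
  if (i < 0 || i >= n) Kokkos::abort("index out of range");
  return i;
}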
@@ -101,8 +101,8 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
   void push_work(const std::int32_t w) const noexcept {
     const std::int32_t N = m_graph.numRows();

-    std::int32_t volatile* const ready_queue = &m_queue[0];
-    std::int32_t volatile* const end_hint    = &m_queue[2 * N + 1];
+    std::int32_t* const ready_queue = &m_queue[0];
+    std::int32_t* const end_hint    = &m_queue[2 * N + 1];

     // Push work to end of queue
     const std::int32_t j = atomic_fetch_add(end_hint, 1);

@@ -134,14 +134,14 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
   std::int32_t pop_work() const noexcept {
     const std::int32_t N = m_graph.numRows();

-    std::int32_t volatile* const ready_queue = &m_queue[0];
-    std::int32_t volatile* const begin_hint  = &m_queue[2 * N];
+    std::int32_t* const ready_queue = &m_queue[0];
+    std::int32_t* const begin_hint  = &m_queue[2 * N];

     // begin hint is guaranteed to be less than or equal to
     // actual begin location in the queue.

-    for (std::int32_t i = *begin_hint; i < N; ++i) {
-      const std::int32_t w = ready_queue[i];
+    for (std::int32_t i = Kokkos::atomic_load(begin_hint); i < N; ++i) {
+      const std::int32_t w = Kokkos::atomic_load(&ready_queue[i]);

       if (w == END_TOKEN) {
         return END_TOKEN;

@@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {

     const std::int32_t N = m_graph.numRows();

-    std::int32_t volatile* const count_queue = &m_queue[N];
+    std::int32_t* const count_queue = &m_queue[N];

     const std::int32_t B = m_graph.row_map(w);
     const std::int32_t E = m_graph.row_map(w + 1);

@@ -199,7 +199,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {

   KOKKOS_INLINE_FUNCTION
   void operator()(const TagCount, int i) const noexcept {
-    std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()];
+    std::int32_t* const count_queue = &m_queue[m_graph.numRows()];

     atomic_increment(count_queue + m_graph.entries[i]);
   }
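These hunks replace volatile reads with explicit Kokkos::atomic_load calls: volatile only inhibits compiler caching, while the atomic load gives well-defined concurrent semantics on every backend. A minimal sketch of the resulting polling pattern, with hypothetical names:

#include <Kokkos_Core.hpp>
#include <cstdint>

// Spin until another thread publishes a nonnegative slot value.
// Kokkos::atomic_load gives the cross-thread visibility guarantee that a
// plain (or volatile) read does not.
KOKKOS_FUNCTION std::int32_t wait_for_slot(std::int32_t* slot) {
  std::int32_t w = -1;
  while ((w = Kokkos::atomic_load(slot)) < 0) {
  }
  return w;
}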
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {

 struct OpenACC_Traits {
 #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
-    defined(KOKKOS_ARCH_AMPERE)
+    defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
   static constexpr acc_device_t dev_type     = acc_device_nvidia;
   static constexpr bool may_fallback_to_host = false;
 #else

@@ -47,6 +47,7 @@
 #endif

 #include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_DeviceManagement.hpp>

 #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)

@@ -115,7 +116,8 @@ void OpenMPTargetInternal::impl_initialize() {
 // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
 // from Pascal and upwards.
 #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||    \
-    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+    defined(KOKKOS_ARCH_HOPPER)
 #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
   omp_set_num_teams(512);
 #endif

@@ -164,7 +166,11 @@ void OpenMPTarget::impl_static_fence(const std::string& name) {
       name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
 }

-void OpenMPTarget::impl_initialize(InitializationSettings const&) {
+void OpenMPTarget::impl_initialize(InitializationSettings const& settings) {
+  using Kokkos::Impl::get_gpu;
+  const int device_num = get_gpu(settings);
+  omp_set_default_device(device_num);
+
   Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize();
 }
 void OpenMPTarget::impl_finalize() {

@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
 #if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
     !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) &&   \
     !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) &&   \
-    !defined(KOKKOS_ARCH_AMPERE)
+    !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
   if (!settings.has_device_id() && gpu_devices.empty()) {
     Impl::SYCLInternal::singleton().initialize(sycl::device());
     return;

@@ -337,7 +337,8 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
 // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
 #if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) ||   \
     defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||     \
-    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) ||  \
+    defined(KOKKOS_ARCH_HOPPER)
         256,
 #endif
         max_threads_for_memory

@@ -369,7 +370,8 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
 // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
 #if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) ||   \
     defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||     \
-    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) ||  \
+    defined(KOKKOS_ARCH_HOPPER)
         256,
 #endif
         max_threads_for_memory

@@ -110,10 +110,9 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept {

   return ((uint64_t)a) | (((uint64_t)d) << 32);

-#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \
-    defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__)
+#elif defined(__powerpc64__) || defined(__ppc64__)

-  unsigned int cycles = 0;
+  unsigned long cycles = 0;

   asm volatile("mftb %0" : "=r"(cycles));

@@ -166,6 +166,8 @@ int get_device_count() {
 #elif defined(KOKKOS_ENABLE_OPENACC)
   return acc_get_num_devices(
       Kokkos::Experimental::Impl::OpenACC_Traits::dev_type);
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
+  return omp_get_num_devices();
 #else
   Kokkos::abort("implementation bug");
   return -1;

@@ -426,11 +428,17 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) {
     Kokkos::abort("implementation bug");
   }

-  auto const* local_rank_str =
-      std::getenv("OMPI_COMM_WORLD_LOCAL_RANK");  // OpenMPI
-  if (!local_rank_str)
-    local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK");  // MVAPICH2
-  if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID");  // SLURM
+  char const* local_rank_str = nullptr;
+  for (char const* env_var : {
+           "OMPI_COMM_WORLD_LOCAL_RANK",  // OpenMPI
+           "MV2_COMM_WORLD_LOCAL_RANK",   // MVAPICH2
+           "MPI_LOCALRANKID",             // MPICH
+           "SLURM_LOCALID",               // SLURM
+           "PMI_LOCAL_RANK"               // PMI
+       }) {
+    local_rank_str = std::getenv(env_var);
+    if (local_rank_str) break;
+  }

   // use first GPU available for execution if unable to detect local MPI rank
   if (!local_rank_str) {
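The loop above generalizes launcher detection to more MPI launchers. A standalone sketch of the same mapping from node-local MPI rank to device id (num_devices, the round-robin modulo, and the fallback to device 0 are illustrative assumptions, not the exact Kokkos policy):

#include <cstdlib>
#include <initializer_list>
#include <string>

// Map a node-local MPI rank (exported by the launcher) onto one of the
// node's GPUs; fall back to device 0 when no launcher variable is set.
int pick_gpu(int num_devices) {
  char const* local_rank_str = nullptr;
  for (char const* env_var :
       {"OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK",
        "MPI_LOCALRANKID", "SLURM_LOCALID", "PMI_LOCAL_RANK"}) {
    local_rank_str = std::getenv(env_var);
    if (local_rank_str) break;
  }
  if (!local_rank_str) return 0;
  return std::stoi(local_rank_str) % num_devices;
}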
@@ -1128,9 +1128,8 @@ struct ViewOffset<
   KOKKOS_INLINE_FUNCTION constexpr ViewOffset(
       const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs)
       : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
-    static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
-                      (DimRHS::rank == 1 && dimension_type::rank == 1 &&
-                       dimension_type::rank_dynamic == 1),
+    static_assert(((DimRHS::rank == 0 && dimension_type::rank == 0) ||
+                   (DimRHS::rank == 1 && dimension_type::rank == 1)),
                   "ViewOffset LayoutLeft and LayoutRight are only compatible "
                   "when rank <= 1");
   }

@@ -1778,8 +1777,7 @@ struct ViewOffset<
       const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs)
       : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
     static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
-                      (DimRHS::rank == 1 && dimension_type::rank == 1 &&
-                       dimension_type::rank_dynamic == 1),
+                      (DimRHS::rank == 1 && dimension_type::rank == 1),
                   "ViewOffset LayoutRight and LayoutLeft are only compatible "
                   "when rank <= 1");
   }

@@ -3059,10 +3057,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
       std::is_trivially_copy_assignable<Dummy>::value>
   construct_shared_allocation() {
     // Shortcut for zero initialization
-    ValueType value{};
 // On A64FX memset seems to do the wrong thing with regards to first touch
 // leading to the significant performance issues
 #ifndef KOKKOS_ARCH_A64FX
+    ValueType value{};
     if (Impl::is_zero_byte(value)) {
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
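Moving ValueType value{} inside the #ifndef keeps the variable from being unused on the A64FX path, where the memset shortcut is skipped. The shortcut itself rests on a zero-byte test: if a value-initialized scalar has an all-zero representation, the whole allocation can be zero-filled in bulk. A minimal sketch of such a test, assuming nothing beyond the standard library (this is not the exact Impl::is_zero_byte):

#include <cstring>

// True when the value representation of a value-initialized T is all zeros,
// so memset(ptr, 0, n * sizeof(T)) is equivalent to constructing T{} n times.
template <class T>
bool is_zero_byte(const T& value) {
  static const char zeros[sizeof(T)] = {};
  return std::memcmp(&value, zeros, sizeof(T)) == 0;
}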
@ -3539,9 +3537,7 @@ class ViewMapping<
|
|||||||
typename SrcTraits::array_layout>::value ||
|
typename SrcTraits::array_layout>::value ||
|
||||||
std::is_same<typename DstTraits::array_layout,
|
std::is_same<typename DstTraits::array_layout,
|
||||||
Kokkos::LayoutStride>::value ||
|
Kokkos::LayoutStride>::value ||
|
||||||
(DstTraits::dimension::rank == 0) ||
|
(DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1)
|
||||||
(DstTraits::dimension::rank == 1 &&
|
|
||||||
DstTraits::dimension::rank_dynamic == 1)
|
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|||||||
@ -73,6 +73,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
|
|||||||
|
|
||||||
SET(COMPILE_ONLY_SOURCES
|
SET(COMPILE_ONLY_SOURCES
|
||||||
TestArray.cpp
|
TestArray.cpp
|
||||||
|
TestCreateMirror.cpp
|
||||||
TestDetectionIdiom.cpp
|
TestDetectionIdiom.cpp
|
||||||
TestInterOp.cpp
|
TestInterOp.cpp
|
||||||
TestLegionInteroperability.cpp
|
TestLegionInteroperability.cpp
|
||||||
@ -86,6 +87,7 @@ ENDIF()
|
|||||||
KOKKOS_ADD_EXECUTABLE(
|
KOKKOS_ADD_EXECUTABLE(
|
||||||
TestCompileOnly
|
TestCompileOnly
|
||||||
SOURCES
|
SOURCES
|
||||||
|
TestCompileMain.cpp
|
||||||
${COMPILE_ONLY_SOURCES}
|
${COMPILE_ONLY_SOURCES}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
1
lib/kokkos/core/unit_test/TestCompileMain.cpp
Normal file
1
lib/kokkos/core/unit_test/TestCompileMain.cpp
Normal file
@ -0,0 +1 @@
|
|||||||
|
int main() {}
|
||||||
126
lib/kokkos/core/unit_test/TestCreateMirror.cpp
Normal file
126
lib/kokkos/core/unit_test/TestCreateMirror.cpp
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
/*
|
||||||
|
//@HEADER
|
||||||
|
// ************************************************************************
|
||||||
|
//
|
||||||
|
// Kokkos v. 3.0
|
||||||
|
// Copyright (2020) National Technology & Engineering
|
||||||
|
// Solutions of Sandia, LLC (NTESS).
|
||||||
|
//
|
||||||
|
// Under the terms of Contract DE-NA0003525 with NTESS,
|
||||||
|
// the U.S. Government retains certain rights in this software.
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
// modification, are permitted provided that the following conditions are
|
||||||
|
// met:
|
||||||
|
//
|
||||||
|
// 1. Redistributions of source code must retain the above copyright
|
||||||
|
// notice, this list of conditions and the following disclaimer.
|
||||||
|
//
|
||||||
|
// 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
// notice, this list of conditions and the following disclaimer in the
|
||||||
|
// documentation and/or other materials provided with the distribution.
|
||||||
|
//
|
||||||
|
// 3. Neither the name of the Corporation nor the names of the
|
||||||
|
// contributors may be used to endorse or promote products derived from
|
||||||
|
// this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
|
||||||
|
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
|
||||||
|
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
//
|
||||||
|
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
||||||
|
//
|
||||||
|
// ************************************************************************
|
||||||
|
//@HEADER
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <Kokkos_Core.hpp>
|
||||||
|
|
||||||
|
template <typename TestView, typename MemorySpace>
|
||||||
|
void check_memory_space(TestView, MemorySpace) {
|
||||||
|
static_assert(
|
||||||
|
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class View>
|
||||||
|
auto host_mirror_test_space(View) {
|
||||||
|
return std::conditional_t<
|
||||||
|
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
|
||||||
|
typename View::memory_space>::accessible,
|
||||||
|
typename View::memory_space, Kokkos::HostSpace>{};
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename View>
|
||||||
|
void test_create_mirror_properties(const View& view) {
|
||||||
|
using namespace Kokkos;
|
||||||
|
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
|
||||||
|
// create_mirror
|
||||||
|
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror( view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
|
||||||
|
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
|
||||||
|
|
||||||
|
// create_mirror_view
|
||||||
|
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
|
||||||
|
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
|
||||||
|
|
||||||
|
// create_mirror view_alloc
|
||||||
|
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||||
|
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||||
|
|
||||||
|
// create_mirror_view view_alloc
|
||||||
|
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||||
|
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||||
|
|
||||||
|
// create_mirror view_alloc + execution space
|
||||||
|
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||||
|
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||||
|
|
||||||
|
// create_mirror_view view_alloc + execution space
|
||||||
|
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
|
||||||
|
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||||
|
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||||
|
|
||||||
|
// create_mirror_view_and_copy
|
||||||
|
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
|
||||||
|
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
|
||||||
|
|
||||||
|
// create_mirror_view_and_copy view_alloc
|
||||||
|
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
|
||||||
|
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
|
||||||
|
|
||||||
|
// create_mirror_view_and_copy view_alloc + execution space
|
||||||
|
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
|
||||||
|
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
|
||||||
|
|
||||||
|
// clang-format on
|
||||||
|
}
|
||||||
|
|
||||||
|
void test() {
|
||||||
|
Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view("device view",
|
||||||
|
10);
|
||||||
|
Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10);
|
||||||
|
|
||||||
|
test_create_mirror_properties(device_view);
|
||||||
|
test_create_mirror_properties(host_view);
|
||||||
|
}
|
||||||
@ -92,5 +92,3 @@ static_assert(std::is_same<difference_type<Woof>, int>::value,
|
|||||||
static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
|
static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
|
||||||
"Bark's difference_type should be ptrdiff_t!");
|
"Bark's difference_type should be ptrdiff_t!");
|
||||||
} // namespace Example
|
} // namespace Example
|
||||||
|
|
||||||
int main() {}
|
|
||||||
|
@@ -45,12 +45,12 @@
 #include <Kokkos_Core.hpp>
 #include <cstdio>
 
-namespace Test {
+namespace {
 
-template <class Device>
+template <class Device, class T, T ImbalanceSz>
 struct TestScan {
   using execution_space = Device;
-  using value_type      = int64_t;
+  using value_type      = T;
 
   Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
 
@@ -58,7 +58,10 @@ struct TestScan {
   void operator()(const int iwork, value_type& update,
                   const bool final_pass) const {
     const value_type n = iwork + 1;
-    const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0;
+    const value_type imbalance =
+        ((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz))
+            ? ImbalanceSz
+            : value_type(0);
 
     // Insert an artificial load imbalance
 
@@ -133,12 +136,29 @@ struct TestScan {
     }
   }
 };
+}  // namespace
 
 TEST(TEST_CATEGORY, scan) {
-  TestScan<TEST_EXECSPACE>::test_range(1, 1000);
-  TestScan<TEST_EXECSPACE>(0);
-  TestScan<TEST_EXECSPACE>(100000);
-  TestScan<TEST_EXECSPACE>(10000000);
-  TEST_EXECSPACE().fence();
+  constexpr auto imbalance_size = 1000;
+  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>::test_range(1, 1000);
+  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(0);
+  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(100000);
+  TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(10000000);
+}
+
+TEST(TEST_CATEGORY, small_size_scan) {
+  constexpr auto imbalance_size = 10;  // Pick to not overflow...
+  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(0);
+  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(5);
+  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(10);
+  TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(
+      static_cast<std::size_t>(
+          std::sqrt(std::numeric_limits<std::int8_t>::max())));
+  constexpr auto short_imbalance_size = 100;  // Pick to not overflow...
+  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(0);
+  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(5);
+  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(100);
+  TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(
+      static_cast<std::size_t>(
+          std::sqrt(std::numeric_limits<std::int16_t>::max())));
 }
-}  // namespace Test
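A note on why small_size_scan sizes its largest problem with sqrt(numeric_limits<T>::max()): each work item contributes iwork + 1, so the inclusive running total reaches N*(N+1)/2, which stays near max/2 when N is about sqrt(max) (for std::int8_t, N = 11 gives a total of 66 against a maximum of 127). A minimal parallel_scan sketch in the same spirit, assuming an initialized Kokkos runtime (names are illustrative):

    #include <Kokkos_Core.hpp>
    #include <cstdint>

    // Inclusive prefix sum of the contributions (i + 1); returns n*(n+1)/2.
    std::int64_t triangular_sum(int n) {
      std::int64_t total = 0;
      Kokkos::parallel_scan(
          "triangular_sum", n,
          KOKKOS_LAMBDA(const int i, std::int64_t& update, const bool) {
            update += i + 1;  // same per-item contribution as TestScan
          },
          total);
      return total;
    }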
@@ -1616,6 +1616,73 @@ struct TestTeamPolicyHandleByValue {
 
 }  // namespace
 
+namespace {
+
+template <typename ExecutionSpace>
+struct TestRepeatedTeamReduce {
+  static constexpr int ncol = 1500;  // nothing special, just some work
+
+  KOKKOS_FUNCTION void operator()(
+      const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
+      const {
+    // non-divisible by power of two to make triggering problems easier
+    constexpr int nlev = 129;
+    constexpr auto pi  = Kokkos::Experimental::pi_v<double>;
+    double b           = 0.;
+    for (int ri = 0; ri < 10; ++ri) {
+      // The contributions here must be sufficiently complex, simply adding ones
+      // wasn't enough to trigger the bug.
+      const auto g1 = [&](const int k, double &acc) {
+        acc += Kokkos::cos(pi * double(k) / nlev);
+      };
+      const auto g2 = [&](const int k, double &acc) {
+        acc += Kokkos::sin(pi * double(k) / nlev);
+      };
+      double a1, a2;
+      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
+      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
+      b += a1;
+      b += a2;
+    }
+    const auto h = [&]() {
+      const auto col = team.league_rank();
+      v(col)         = b + col;
+    };
+    Kokkos::single(Kokkos::PerTeam(team), h);
+  }
+
+  KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
+    if (v(i) != v(0) + i) {
+      ++bad;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i);
+    }
+  }
+
+  TestRepeatedTeamReduce() : v("v", ncol) { test(); }
+
+  void test() {
+    int team_size_recommended =
+        Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended(
+            *this, Kokkos::ParallelForTag());
+    // Choose a non-recommended (non-power of two for GPUs) team size
+    int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1;
+
+    // The failure was non-deterministic so run the test a bunch of times
+    for (int it = 0; it < 100; ++it) {
+      Kokkos::parallel_for(
+          Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this);
+
+      int bad = 0;
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol),
+                              *this, bad);
+      ASSERT_EQ(bad, 0) << " Failing in iteration " << it;
+    }
+  }
+
+  Kokkos::View<double *, ExecutionSpace> v;
+};
+
+}  // namespace
+
 }  // namespace Test
 
 /*--------------------------------------------------------------------------*/
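The regression test above stresses repeated nested reductions inside one team kernel. The core pattern it relies on is a per-team reduction over a TeamThreadRange whose result is then published by a single thread. A minimal sketch of that pattern, assuming an initialized Kokkos runtime (names are illustrative):

    #include <Kokkos_Core.hpp>

    // Each team sums 0..127 with a nested reduction and one thread per team
    // writes the result for its league index.
    void team_sums(Kokkos::View<double*> out) {
      using policy_t = Kokkos::TeamPolicy<>;
      using member_t = policy_t::member_type;
      Kokkos::parallel_for(
          "team_sums", policy_t(out.extent(0), Kokkos::AUTO),
          KOKKOS_LAMBDA(const member_t& team) {
            double sum = 0.0;
            Kokkos::parallel_reduce(
                Kokkos::TeamThreadRange(team, 128),
                [&](const int k, double& acc) { acc += double(k); }, sum);
            Kokkos::single(Kokkos::PerTeam(team),
                           [&]() { out(team.league_rank()) = sum; });
          });
    }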
@@ -134,5 +134,15 @@ TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) {
   }
 }
 
+TEST(TEST_CATEGORY, repeated_team_reduce) {
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
+    GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not "
+                    "properly implemented";
+#endif
+
+  TestRepeatedTeamReduce<TEST_EXECSPACE>();
+}
+
 }  // namespace Test
 #endif
@@ -92,8 +92,18 @@ TEST(TEST_CATEGORY, view_is_assignable) {
                           View<double*, left, d_exec>>::test(false, false, 10);
 
   // Layout assignment
+  Impl::TestAssignability<View<int, left, d_exec>,
+                          View<int, right, d_exec>>::test(true, true);
   Impl::TestAssignability<View<int*, left, d_exec>,
                           View<int*, right, d_exec>>::test(true, true, 10);
+  Impl::TestAssignability<View<int[5], left, d_exec>,
+                          View<int*, right, d_exec>>::test(false, false, 10);
+  Impl::TestAssignability<View<int[10], left, d_exec>,
+                          View<int*, right, d_exec>>::test(false, true, 10);
+  Impl::TestAssignability<View<int*, left, d_exec>,
+                          View<int[5], right, d_exec>>::test(true, true);
+  Impl::TestAssignability<View<int[5], left, d_exec>,
+                          View<int[10], right, d_exec>>::test(false, false);
 
   // This could be made possible (due to the degenerate nature of the views) but
   // we do not allow this yet
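The new cases document which layout/extent combinations are assignable: rank-0 and rank-1 contiguous views can cross between LayoutLeft and LayoutRight, while a view with a static extent can only receive a dynamic one under a runtime extent check. A short sketch of the accepted conversions, assuming an initialized Kokkos runtime (variable names are illustrative):

    #include <Kokkos_Core.hpp>

    void assignability_examples() {
      // Rank-1 LayoutLeft and LayoutRight describe the same contiguous
      // stride, so this assignment is accepted at compile time.
      Kokkos::View<int*, Kokkos::LayoutLeft> a("a", 10);
      Kokkos::View<int*, Kokkos::LayoutRight> b = a;

      // Static extent from dynamic extent: checked at runtime, the
      // extents must match (here both are 10).
      Kokkos::View<int[10], Kokkos::LayoutLeft> c(a);
    }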
199  lib/kokkos/kokkos_5538.diff  Normal file
@@ -0,0 +1,199 @@
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 22af411f32..530510a0d1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Threads"
 # Options:
 # Intel:    KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM:      BGQ,Power7,Power8,Power9
 # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
 KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
 KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
 KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
 KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
                                    + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
                                    + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
                                    + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
                                    + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
                                    + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
-                                   + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+                                   + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+                                   + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
 
 #SEK: This seems like a bug to me
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
     KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
   endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+  endif
 
   ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
     KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 88ddc48378..b83ced9243 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -102,6 +102,7 @@
 #cmakedefine KOKKOS_ARCH_AMPERE
 #cmakedefine KOKKOS_ARCH_AMPERE80
 #cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
 #cmakedefine KOKKOS_ARCH_AMD_ZEN
 #cmakedefine KOKKOS_ARCH_AMD_ZEN2
 #cmakedefine KOKKOS_ARCH_AMD_ZEN3
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index f56cef1651..2585a6a64c 100644
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -74,6 +74,7 @@ int main() {
     case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
     case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
     case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+    case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
     default:
       std::cout << "Compute capability " << compute_capability
                 << " is not supported" << std::endl;
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index ef16aad047..c1d76cceeb 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
 KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
 KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
 KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
 
 IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
   SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
 CHECK_CUDA_ARCH(TURING75 sm_75)
 CHECK_CUDA_ARCH(AMPERE80 sm_80)
 CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
 
 SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
 FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
   SET(KOKKOS_ARCH_AMPERE ON)
 ENDIF()
 
+IF (KOKKOS_ARCH_HOPPER90)
+  SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
 #Regardless of version, make sure we define the general architecture name
 IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
   SET(KOKKOS_ARCH_VEGA ON)
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 56f9117844..fcd4773dbc 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
       case 61: return 96;
       case 70:
       case 80:
-      case 86: return 8;
+      case 86:
+      case 90: return 8;
       case 75: return 32;
       default:
         Kokkos::Impl::throw_runtime_exception(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index 40a263561f..8c40ebd60d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
 #endif  // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
 
 #if CUDA_VERSION >= 11010 && \
-    ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+    ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
 KOKKOS_INLINE_FUNCTION
 bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
 KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index f9451ecfe6..2ce1efb98c 100644
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
 
 struct OpenACC_Traits {
 #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
-    defined(KOKKOS_ARCH_AMPERE)
+    defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
   static constexpr acc_device_t dev_type     = acc_device_nvidia;
   static constexpr bool may_fallback_to_host = false;
 #else
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index a9bc085912..27ee1d4232 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
 
   // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
  // from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
-    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||    \
+    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+    defined(KOKKOS_ARCH_HOPPER)
 #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
   omp_set_num_teams(512);
 #endif
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 840db4327c..7e5addbc5b 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
 #if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
     !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) &&   \
     !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) &&   \
-    !defined(KOKKOS_ARCH_AMPERE)
+    !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
   if (!settings.has_device_id() && gpu_devices.empty()) {
     Impl::SYCLInternal::singleton().initialize(sycl::device());
     return;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 5ac7d8af30..ba101f699e 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
     return std::min({
       int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
 // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
-    defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||   \
-    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) ||  \
+    defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||    \
+    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+    defined(KOKKOS_ARCH_HOPPER)
           256,
 #endif
           max_threads_for_memory
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
     return std::min<int>({
       int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
 // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
-    defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||   \
-    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) ||  \
+    defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) ||    \
+    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+    defined(KOKKOS_ARCH_HOPPER)
           256,
 #endif
           max_threads_for_memory
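Once these build-system hooks are in place, code can branch on the generated macros just as it already does for Ampere. A hypothetical sketch, not part of the patch (the tuning numbers are invented for illustration):

    #include <Kokkos_Core.hpp>

    // Gate a tuning decision on the GPU generation Kokkos was built for.
    constexpr int preferred_team_size() {
    #if defined(KOKKOS_ARCH_HOPPER)
      return 256;  // illustrative choice for sm_90 builds
    #elif defined(KOKKOS_ARCH_AMPERE)
      return 128;  // illustrative choice for sm_80/sm_86 builds
    #else
      return 64;   // conservative default elsewhere
    #endif
    }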
523  lib/kokkos/kokkos_5706.diff  Normal file
@@ -0,0 +1,523 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index fcd4773dbc..30b6958a67 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
                                LaunchBounds{});
 }
 
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
 // NOTE these number can be obtained several ways:
 // * One option is to download the CUDA Occupancy Calculator spreadsheet, select
 //   "Compute Capability" first and check what is the smallest "Shared Memory
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
     return 0;
   }() * 1024;
 }
+
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 5811498e01..e22eb3b842 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
   }
 #endif
 
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
-  cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
-  cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
   // Init the array for used for arbitrarily sized atomics
   if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84f..5c4c3a7d39 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -93,10 +93,6 @@ namespace Impl {
 // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
 // function qualifier which could be used to improve performance.
 //----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------
 
 template <class DriverType>
 __global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
   }
 }
 
-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
 // so that the static bool is unique for each type combo
 // KernelFuncPtr does not necessarily contain that type information.
+
 template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
-                                       bool prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+    const KernelFuncPtr& func) {
+  // Only call cudaFuncGetAttributes once for each unique kernel
+  // by leveraging static variable initialization rules
+  auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+    cudaFuncAttributes attr;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+    return attr;
+  };
+  static cudaFuncAttributes func_attr = wrap_get_attributes();
+  return func_attr;
+}
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+                                       const cudaDeviceProp& device_props,
+                                       const size_t block_size, int& shmem,
+                                       const size_t occupancy) {
 #ifndef KOKKOS_ARCH_KEPLER
-  // On Kepler the L1 has no benefit since it doesn't cache reads
+
+  const auto& func_attr =
+      get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+  // Compute limits for number of blocks due to registers/SM
+  const size_t regs_per_sm     = device_props.regsPerMultiprocessor;
+  const size_t regs_per_thread = func_attr.numRegs;
+  // The granularity of register allocation is chunks of 256 registers per warp
+  // -> 8 registers per thread
+  const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+  const size_t max_blocks_regs =
+      regs_per_sm / (allocated_regs_per_thread * block_size);
+
+  // Compute how many threads per sm we actually want
+  const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+  // only allocate multiples of warp size
+  const size_t num_threads_desired =
+      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+  // Get close to the desired occupancy,
+  // don't undershoot by much but also don't allocate a whole new block just
+  // because one is a few threads over otherwise.
+  size_t num_blocks_desired =
+      (num_threads_desired + block_size * 0.8) / block_size;
+  num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+  if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+  // Calculate how much shared memory we need per block
+  size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+  // The minimum shared memory allocation we can have in total per SM is 8kB.
+  // If we want to lower occupancy we have to make sure we request at least that
+  // much in aggregate over all blocks, so that shared memory actually becomes a
+  // limiting factor for occupancy
+  constexpr size_t min_shmem_size_per_sm = 8192;
+  if ((occupancy < 100) &&
+      (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+    shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+    // Need to set the caller's shmem variable so that the
+    // kernel launch uses the correct dynamic shared memory request
+    shmem = shmem_per_block - func_attr.sharedSizeBytes;
+  }
+
+  // Compute the carveout fraction we need based on occupancy
+  // Use multiples of 8kB
+  const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+  size_t carveout = shmem_per_block == 0
+                        ? 0
+                        : 100 *
+                              (((num_blocks_desired * shmem_per_block +
+                                 min_shmem_size_per_sm - 1) /
+                                min_shmem_size_per_sm) *
+                               min_shmem_size_per_sm) /
+                              max_shmem_per_sm;
+  if (carveout > 100) carveout = 100;
+
+  // Set the carveout, but only call it once per kernel or when it changes
   auto set_cache_config = [&] {
-    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
-        func,
-        (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
-    return prefer_shmem;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+        func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+    return carveout;
   };
-  static bool cache_config_preference_cached = set_cache_config();
-  if (cache_config_preference_cached != prefer_shmem) {
+  // Store the value in a static variable so we only reset if needed
+  static size_t cache_config_preference_cached = set_cache_config();
+  if (cache_config_preference_cached != carveout) {
     cache_config_preference_cached = set_cache_config();
   }
 #else
   // Use the parameters so we don't get a warning
   (void)func;
-  (void)prefer_shmem;
+  (void)device_props;
+  (void)block_size;
+  (void)occupancy;
 #endif
 }
 
-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
-    Policy const& policy, cudaDeviceProp const& properties,
-    cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
-    bool& prefer_shmem) {
-  int const block_size = block.x * block.y * block.z;
-  int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
-  size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
-  size_t const static_shmem = attributes.sharedSizeBytes;
-
-  // round to nearest integer and avoid division by zero
-  int active_blocks = std::max(
-      1, static_cast<int>(std::round(
-             static_cast<double>(properties.maxThreadsPerMultiProcessor) /
-             block_size * desired_occupancy / 100)));
-  int const dynamic_shmem =
-      shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
-  if (dynamic_shmem > shmem) {
-    shmem = dynamic_shmem;
-    prefer_shmem = false;
-  }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
-    Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
-    dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
 // </editor-fold> end Some helper functions for launch code readability }}}1
 //==============================================================================
 
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
   inline static void create_parallel_launch_graph_node(
       DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
-      CudaInternal const* cuda_instance, bool prefer_shmem) {
+      CudaInternal const* cuda_instance) {
     //----------------------------------------
     auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
     KOKKOS_EXPECTS(bool(graph));
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
 
     if (!Impl::is_empty_launch(grid, block)) {
       Impl::check_shmem_request(cuda_instance, shmem);
-      Impl::configure_shmem_preference<DriverType, LaunchBounds>(
-          base_t::get_kernel_func(), prefer_shmem);
+      if (DriverType::Policy::
+              experimental_contains_desired_occupancy) {
+        int desired_occupancy =
+            driver.get_policy().impl_get_desired_occupancy().value();
+        size_t block_size = block.x * block.y * block.z;
+        Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+            shmem, desired_occupancy);
+      }
 
       void const* args[] = {&driver};
 
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
   inline static void create_parallel_launch_graph_node(
       DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
-      CudaInternal const* cuda_instance, bool prefer_shmem) {
+      CudaInternal const* cuda_instance) {
     //----------------------------------------
     auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
     KOKKOS_EXPECTS(bool(graph));
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
 
     if (!Impl::is_empty_launch(grid, block)) {
       Impl::check_shmem_request(cuda_instance, shmem);
-      Impl::configure_shmem_preference<DriverType, LaunchBounds>(
-          base_t::get_kernel_func(), prefer_shmem);
+      if constexpr (DriverType::Policy::
+                        experimental_contains_desired_occupancy) {
+        int desired_occupancy =
+            driver.get_policy().impl_get_desired_occupancy().value();
+        size_t block_size = block.x * block.y * block.z;
+        Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+            shmem, desired_occupancy);
+      }
 
       auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
 
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
   inline static void create_parallel_launch_graph_node(
       DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
-      CudaInternal const* cuda_instance, bool prefer_shmem) {
+      CudaInternal const* cuda_instance) {
     // Just use global memory; coordinating through events to share constant
     // memory with the non-graph interface is not really reasonable since
     // events don't work with Graphs directly, and this would anyway require
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
         DriverType, LaunchBounds,
         Experimental::CudaLaunchMechanism::GlobalMemory>;
     global_launch_impl_t::create_parallel_launch_graph_node(
-        driver, grid, block, shmem, cuda_instance, prefer_shmem);
+        driver, grid, block, shmem, cuda_instance);
   }
 #endif
 };
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
 
   inline static void launch_kernel(const DriverType& driver, const dim3& grid,
                                    const dim3& block, int shmem,
-                                   const CudaInternal* cuda_instance,
-                                   bool prefer_shmem) {
+                                   const CudaInternal* cuda_instance) {
     if (!Impl::is_empty_launch(grid, block)) {
       // Prevent multiple threads to simultaneously set the cache configuration
       // preference and launch the same kernel
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
 
       Impl::check_shmem_request(cuda_instance, shmem);
 
-      // If a desired occupancy is specified, we compute how much shared memory
-      // to ask for to achieve that occupancy, assuming that the cache
-      // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
-      // shared memory computed is actually smaller than `shmem` we overwrite
-      // `shmem` and set `prefer_shmem` to `false`.
-      modify_launch_configuration_if_desired_occupancy_is_specified(
-          driver.get_policy(), cuda_instance->m_deviceProp,
-          get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
-      Impl::configure_shmem_preference<
-          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
-          base_t::get_kernel_func(), prefer_shmem);
+      if (DriverType::Policy::
+              experimental_contains_desired_occupancy) {
+        int desired_occupancy =
+            driver.get_policy().impl_get_desired_occupancy().value();
+        size_t block_size = block.x * block.y * block.z;
+        Impl::configure_shmem_preference<
+            DriverType,
+            Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+            shmem, desired_occupancy);
+      }
 
       KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
 
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
   }
 
   static cudaFuncAttributes get_cuda_func_attributes() {
-    // Race condition inside of cudaFuncGetAttributes if the same address is
-    // given requires using a local variable as input instead of a static Rely
-    // on static variable initialization to make sure only one thread executes
-    // the code and the result is visible.
-    auto wrap_get_attributes = []() -> cudaFuncAttributes {
-      cudaFuncAttributes attr_tmp;
-      KOKKOS_IMPL_CUDA_SAFE_CALL(
-          cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
-      return attr_tmp;
-    };
-    static cudaFuncAttributes attr = wrap_get_attributes();
-    return attr;
+    return get_cuda_kernel_func_attributes<
+        DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+        base_t::get_kernel_func());
   }
 };
 
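A worked pass through the carveout arithmetic above, with illustrative numbers (not taken from the patch): block_size = 256, occupancy = 50, 2048 threads/SM, 100 kB shared memory/SM, and a 4 kB per-block request. Then num_threads_desired = 1024, num_blocks_desired = (1024 + 204.8)/256 = 4, the aggregate request 4 * 4096 = 16384 bytes is already a multiple of the 8 kB granule, and the carveout comes out as 100 * 16384 / 102400 = 16 percent. The same computation as a standalone sketch:

    #include <algorithm>
    #include <cstddef>

    size_t example_carveout() {
      const size_t block_size = 256, occupancy = 50;
      const size_t max_threads_per_sm = 2048, max_shmem_per_sm = 100 * 1024;
      const size_t shmem_per_block           = 4096;
      constexpr size_t min_shmem_size_per_sm = 8192;  // 8 kB granule

      // Desired threads, rounded to warp multiples: 1024.
      const size_t num_threads_desired =
          ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
      // Blocks needed to get close to that without overshooting: 4.
      const size_t num_blocks_desired =
          (num_threads_desired + block_size * 0.8) / block_size;

      // Aggregate request rounded up to 8 kB granules, as a percentage
      // of the SM's shared memory: 16.
      size_t carveout = 100 *
                        (((num_blocks_desired * shmem_per_block +
                           min_shmem_size_per_sm - 1) /
                          min_shmem_size_per_sm) *
                         min_shmem_size_per_sm) /
                        max_shmem_per_sm;
      return std::min<size_t>(carveout, 100);
    }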
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index e586bb4cc6..0e348c092a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
                        maxblocks[1]),
           1);
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else if (RP::rank == 3) {
       const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
       KOKKOS_ASSERT(block.x > 0);
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
               (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
               maxblocks[2]));
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else if (RP::rank == 4) {
       // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
       // threadIdx.z
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
               (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
               maxblocks[2]));
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else if (RP::rank == 5) {
       // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
       // threadIdx.z
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
               (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
               maxblocks[2]));
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else if (RP::rank == 6) {
       // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
       // threadIdx.z
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
           std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
                                      maxblocks[2]));
       CudaParallelLaunch<ParallelFor, LaunchBounds>(
-          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
-          false);
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
     } else {
       Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
     }
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 
     CudaParallelLaunch<ParallelReduce, LaunchBounds>(
         *this, grid, block, shmem,
-        m_policy.space().impl_internal_space_instance(),
-        false);  // copy to device and execute
+        m_policy.space()
+            .impl_internal_space_instance());  // copy to device and execute
 
     if (!m_result_ptr_device_accessible) {
       if (m_result_ptr) {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index ac160f8fe2..d1031751c2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 #endif
 
     CudaParallelLaunch<ParallelFor, LaunchBounds>(
-        *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
-        false);
+        *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
   }
 
   ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
       CudaParallelLaunch<ParallelReduce, LaunchBounds>(
           *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
         if (m_result_ptr) {
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
       m_final = false;
       CudaParallelLaunch<ParallelScan, LaunchBounds>(
           *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       }
 #endif
       m_final = true;
       CudaParallelLaunch<ParallelScan, LaunchBounds>(
           *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute
     }
   }
 
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
       m_final = false;
       CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
          *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
       }
 #endif
       m_final = true;
       CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
           *this, grid, block, shmem,
-          m_policy.space().impl_internal_space_instance(),
-          false);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute
 
     const int size = Analysis::value_size(m_functor);
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdd16085b3..ea9430b812 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
     CudaParallelLaunch<ParallelFor, LaunchBounds>(
         *this, grid, block, shmem_size_total,
-        m_policy.space().impl_internal_space_instance(),
-        true);  // copy to device and execute
+        m_policy.space()
+            .impl_internal_space_instance());  // copy to device and execute
   }
 
   ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
          *this, grid, block, shmem_size_total,
-          m_policy.space().impl_internal_space_instance(),
-          true);  // copy to device and execute
+          m_policy.space()
+              .impl_internal_space_instance());  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
         m_policy.space().fence(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 34d4bef9fd..178012431c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
 // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
 // function qualifier which could be used to improve performance.
 //----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
 /*
  *  Algorithmic constraints:
  *   (a) blockDim.y <= 1024
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fb3a6b138f..a12378a891 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
     const int shared = 0;
 
     Kokkos::Impl::CudaParallelLaunch<Self>(
-        *this, grid, block, shared, Cuda().impl_internal_space_instance(),
-        false);
+        *this, grid, block, shared, Cuda().impl_internal_space_instance());
   }
 
   inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
46  lib/kokkos/kokkos_5731.diff  Normal file
@@ -0,0 +1,46 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 30b6958a67..b94f053272 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
                                LaunchBounds{});
 }
 
-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-//   "Compute Capability" first and check what is the smallest "Shared Memory
-//   Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-//   bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-//   Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-//   the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
-  int const compute_capability = properties.major * 10 + properties.minor;
-  return [compute_capability]() {
-    switch (compute_capability) {
-      case 30:
-      case 32:
-      case 35: return 16;
-      case 37: return 80;
-      case 50:
-      case 53:
-      case 60:
-      case 62: return 64;
-      case 52:
-      case 61: return 96;
-      case 70:
-      case 80:
-      case 86:
-      case 90: return 8;
-      case 75: return 32;
-      default:
-        Kokkos::Impl::throw_runtime_exception(
-            "Unknown device in cuda block size deduction");
-    }
-    return 0;
-  }() * 1024;
-}
-
 }  // namespace Impl
 }  // namespace Kokkos
 
204  lib/kokkos/kokkos_5739.diff  Normal file
@@ -0,0 +1,204 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index b94f053272..252c13c524 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -53,17 +53,69 @@
 namespace Kokkos {
 namespace Impl {
 
+inline int cuda_warp_per_sm_allocation_granularity(
+    cudaDeviceProp const& properties) {
+  // Allocation granularity of warps in each sm
+  switch (properties.major) {
+    case 3:
+    case 5:
+    case 7:
+    case 8:
+    case 9: return 4;
+    case 6: return (properties.minor == 0 ? 2 : 4);
+    default:
+      throw_runtime_exception(
+          "Unknown device in cuda warp per sm allocation granularity");
+      return 0;
+  }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+    cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+  // Maximum number of warps per sm as a function of register counts,
+  // subject to the constraint that warps are allocated with a fixed granularity
+  int const max_regs_per_block = properties.regsPerBlock;
+  int const regs_per_warp      = attributes.numRegs * properties.warpSize;
+  int const warp_granularity =
+      cuda_warp_per_sm_allocation_granularity(properties);
+  // The granularity of register allocation is chunks of 256 registers per warp,
+  // which implies a need to over-allocate, so we round up
+  int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
+
+  // The maximum number of warps per SM is constrained from above by register
+  // allocation. To satisfy the constraint that warps per SM is allocated at a
+  // finite granularity, we need to round down.
+  int const max_warps_per_sm =
+      warp_granularity *
+      (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+  return max_warps_per_sm;
+}
+
 inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
                                          cudaFuncAttributes const& attributes,
                                          int block_size, size_t dynamic_shmem) {
-  // Limits due do registers/SM
+  // Limits due to registers/SM
   int const regs_per_sm     = properties.regsPerMultiprocessor;
   int const regs_per_thread = attributes.numRegs;
   // The granularity of register allocation is chunks of 256 registers per warp
   // -> 8 registers per thread
   int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
-  int const max_blocks_regs =
-      regs_per_sm / (allocated_regs_per_thread * block_size);
+  int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+  // Compute the maximum number of warps as a function of the number of
+  // registers
+  int const max_warps_per_sm_registers =
+      cuda_max_warps_per_sm_registers(properties, attributes);
+
+  // Constrain the number of blocks to respect the maximum number of warps per
+  // SM. On face value this should be an equality, but due to the warp
+  // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+  // left-hand-side of this comparison can overshoot what the hardware allows
+  // based on register counts alone
+  while ((max_blocks_regs * block_size / properties.warpSize) >
+         max_warps_per_sm_registers)
+    max_blocks_regs--;
 
   // Limits due to shared memory/SM
   size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
                                LaunchBounds{});
 }
 
+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+                                     LaunchBounds) {
+  auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+  // Thin version of cuda_get_opt_block_size for cases where there is no shared
+  // memory
+  auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+  return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+                                LaunchBounds{});
+}
+
 }  // namespace Impl
 }  // namespace Kokkos
 
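A worked example of the registers-per-SM bound that both this helper and configure_shmem_preference use, with illustrative numbers (not from the patch): 65536 registers per SM, 40 registers per thread reported by cudaFuncGetAttributes, block size 256. Registers are allocated at an 8-registers-per-thread granularity, so 40 rounds up to 40 and the bound is 65536 / (40 * 256) = 6 resident blocks, before the warp-granularity loop above trims it further if needed:

    // Standalone version of the register-limit arithmetic.
    int example_max_blocks_regs() {
      const int regs_per_sm     = 65536;
      const int regs_per_thread = 40;   // from cudaFuncGetAttributes
      const int block_size      = 256;
      // Round up to the 8-registers-per-thread allocation granularity.
      const int allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
      return regs_per_sm / (allocated_regs_per_thread * block_size);  // 6
    }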
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||||
|
index 5c4c3a7d39..170183ca0a 100644
|
||||||
|
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||||
|
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
|
||||||
|
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
|
||||||
|
// The granularity of register allocation is chunks of 256 registers per warp
|
||||||
|
// -> 8 registers per thread
|
||||||
|
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
|
||||||
|
- const size_t max_blocks_regs =
|
||||||
|
+ size_t max_blocks_regs =
|
||||||
|
regs_per_sm / (allocated_regs_per_thread * block_size);
|
||||||
|
|
||||||
|
+ // Compute the maximum number of warps as a function of the number of
|
||||||
|
+ // registers
|
||||||
|
+ const size_t max_warps_per_sm_registers =
|
||||||
|
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
|
||||||
|
+
|
||||||
|
+ // Constrain the number of blocks to respect the maximum number of warps per
|
||||||
|
+ // SM On face value this should be an equality, but due to the warp
|
||||||
|
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
|
||||||
|
+ // left-hand-side of this comparison can overshoot what the hardware allows
|
||||||
|
+ // based on register counts alone
|
||||||
|
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
|
||||||
|
+ max_warps_per_sm_registers)
|
||||||
|
+ max_blocks_regs--;
|
||||||
|
+
|
||||||
|
// Compute how many threads per sm we actually want
|
||||||
|
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
|
||||||
|
// only allocate multiples of warp size
|
||||||
|
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0e348c092a..7e4f62f12e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -67,6 +67,34 @@
 namespace Kokkos {
 namespace Impl {
 
+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+  cudaFuncAttributes attr =
+      CudaParallelLaunch<ParallelType,
+                         LaunchBounds>::get_cuda_func_attributes();
+  auto const& prop = pol.space().cuda_device_prop();
+
+  // Limits due to registers/SM, MDRange doesn't have
+  // shared memory constraints
+  int const optimal_block_size =
+      Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+  // Compute how many blocks of this size we can launch, based on warp
+  // constraints
+  int const max_warps_per_sm_registers =
+      Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+  int const max_num_threads_from_warps =
+      max_warps_per_sm_registers * prop.warpSize;
+  int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+  // Compute the total number of threads
+  int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+  return std::min(
+      max_threads_per_sm,
+      static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
 template <class FunctorType, class... Traits>
 class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
  public:
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
  public:
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy& pol, const Functor&) {
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelFor,
-                           LaunchBounds>::get_cuda_func_attributes();
-    auto const& prop = pol.space().cuda_device_prop();
-    // Limits due to registers/SM, MDRange doesn't have
-    // shared memory constraints
-    int const regs_per_sm = prop.regsPerMultiprocessor;
-    int const regs_per_thread = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+    return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
   }
   Policy const& get_policy() const { return m_rp; }
   inline __device__ void operator()() const {
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
  public:
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy& pol, const Functor&) {
-    cudaFuncAttributes attr =
-        CudaParallelLaunch<ParallelReduce,
-                           LaunchBounds>::get_cuda_func_attributes();
-    auto const& prop = pol.space().cuda_device_prop();
-    // Limits due do registers/SM
-    int const regs_per_sm = prop.regsPerMultiprocessor;
-    int const regs_per_thread = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+    return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
   }
   Policy const& get_policy() const { return m_policy; }
   inline __device__ void exec_range(reference_type update) const {
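max_tile_size_product is the ceiling that user-chosen MDRange tiles are checked against: the product of the tile extents must not exceed it. A usage sketch of the policy it protects; the extents and tile sizes are made-up values:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // A 2-D iteration space with an explicit {32, 8} tile: 256 threads per
    // tile, which must stay within max_tile_size_product on CUDA.
    Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy({0, 0}, {1024, 1024},
                                                  {32, 8});
    Kokkos::parallel_for(
        "touch", policy, KOKKOS_LAMBDA(const int i, const int j) {
          (void)i;
          (void)j;  // body elided; only the tile shape matters here
        });
  }
  Kokkos::finalize();
  return 0;
}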
63  lib/kokkos/kokkos_fix_5706_apply_last.diff  Normal file
@@ -0,0 +1,63 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 170183ca0a..ba43e362bb 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
     Impl::check_shmem_request(cuda_instance, shmem);
     if (DriverType::Policy::
             experimental_contains_desired_occupancy) {
+      /*
       int desired_occupancy =
           driver.get_policy().impl_get_desired_occupancy().value();
       size_t block_size = block.x * block.y * block.z;
       Impl::configure_shmem_preference<DriverType, LaunchBounds>(
           base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
-          shmem, desired_occupancy);
+          shmem, desired_occupancy);*/
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Cuda graph node creation FAILED:"
+                      " occupancy requests are currently broken."));
     }
 
     void const* args[] = {&driver};
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
 
     if (!Impl::is_empty_launch(grid, block)) {
      Impl::check_shmem_request(cuda_instance, shmem);
-      if constexpr (DriverType::Policy::
+      if (DriverType::Policy::
              experimental_contains_desired_occupancy) {
-        int desired_occupancy =
+        /*int desired_occupancy =
            driver.get_policy().impl_get_desired_occupancy().value();
        size_t block_size = block.x * block.y * block.z;
        Impl::configure_shmem_preference<DriverType, LaunchBounds>(
            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
-            shmem, desired_occupancy);
+            shmem, desired_occupancy);*/
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Cuda graph node creation FAILED:"
+                        " occupancy requests are currently broken."));
      }
 
      auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
 
      if (DriverType::Policy::
              experimental_contains_desired_occupancy) {
-        int desired_occupancy =
+        /*int desired_occupancy =
            driver.get_policy().impl_get_desired_occupancy().value();
        size_t block_size = block.x * block.y * block.z;
        Impl::configure_shmem_preference<
            DriverType,
            Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
            base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
-            shmem, desired_occupancy);
+            shmem, desired_occupancy);*/
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Cuda graph node creation FAILED:"
+                        " occupancy requests are currently broken."));
      }
 
      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
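With the fix-5706 patch applied, a policy carrying a desired-occupancy hint reaches the new throw instead of the commented-out shared-memory configuration. A sketch of the public API that exercises that path; the range bound and the 33% target are illustrative:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Decorate a policy with an occupancy request; under the patched CUDA
    // backend this launch raises the runtime exception added above.
    auto policy = Kokkos::Experimental::prefer(
        Kokkos::RangePolicy<>(0, 1000),
        Kokkos::Experimental::DesiredOccupancy{33});
    Kokkos::parallel_for(
        "occupancy_hint", policy, KOKKOS_LAMBDA(const int i) { (void)i; });
  }
  Kokkos::finalize();
  return 0;
}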
@@ -29,3 +29,4 @@ tag: 3.5.00 date: 11:19:2021 master: c28a8b03 release: 21b879e4
 tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff
 tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404
 tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb
+tag: 3.7.01 date: 12:01:2022 master: 61d7db55 release: d3bb8cfe
@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
   LIB_REQUIRED_PACKAGES KokkosCore
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
   TEST_OPTIONAL_TPLS CUSPARSE
 )
3  lib/kokkos/tpls/.clang-format  Normal file
@@ -0,0 +1,3 @@
+#Official Tool: clang-format version 8.0.0
+DisableFormat: true
+SortIncludes: false
@@ -76,7 +76,7 @@ namespace Impl {
 /// instances in other translation units, we must update this CUDA global
 /// variable based on the Host global variable prior to running any kernels
 /// that will use it.
-/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
+/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
 __device__
 #ifdef __CUDACC_RDC__
 __constant__ extern
@@ -138,42 +138,33 @@ namespace {
 static int lock_array_copied = 0;
 inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 }  // namespace
 
-#ifdef __CUDACC_RDC__
-inline
-#else
-static
-#endif
-    void
-    copy_cuda_lock_arrays_to_device() {
-  if (lock_array_copied == 0) {
-    cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
-                       &CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
-                       sizeof(int32_t*));
-    cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
-                       &CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
-                       sizeof(int32_t*));
-  }
-  lock_array_copied = 1;
-}
-
 }  // namespace Impl
 }  // namespace desul
 
+/* It is critical that this code be a macro, so that it will
+   capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE
+   putting this in an inline function will NOT do the right thing! */
+#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                        \
+  {                                                                         \
+    if (::desul::Impl::lock_array_copied == 0) {                            \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE,     \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,  \
+                         sizeof(int32_t*));                                 \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE,       \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h,    \
+                         sizeof(int32_t*));                                 \
+    }                                                                       \
+    ::desul::Impl::lock_array_copied = 1;                                   \
+  }
+
 #endif /* defined( __CUDACC__ ) */
 
 #endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
 
-namespace desul {
-
 #if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
-inline void ensure_cuda_lock_arrays_on_device() {}
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #else
-static inline void ensure_cuda_lock_arrays_on_device() {
-  Impl::copy_cuda_lock_arrays_to_device();
-}
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
-}  // namespace desul
-
-#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */
+#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */
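The comment in the new macro is the heart of this change: without relocatable device code, each translation unit that includes the header gets its own instance of the __device__ lock-array pointers, and cudaMemcpyToSymbol binds to whichever instance is visible where it is expanded. A compressed sketch of the pattern with shortened, assumed names:

#include <cstdint>
#include <cuda_runtime.h>

// Without -rdc=true, every translation unit including this header owns a
// distinct copy of this __device__ symbol.
__device__ int32_t* lock_ptr_device;
static int32_t* lock_ptr_host = nullptr;

// Expanded as a macro in each calling translation unit, so the
// cudaMemcpyToSymbol below patches *that* unit's symbol instance. An inline
// function would be emitted once and update only a single instance.
#define COPY_LOCK_PTR_TO_DEVICE() \
  cudaMemcpyToSymbol(lock_ptr_device, &lock_ptr_host, sizeof(int32_t*))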
@@ -70,7 +70,7 @@ void init_lock_arrays_cuda() {
       "init_lock_arrays_cuda: cudaMalloc host locks");
 
   auto error_sync1 = cudaDeviceSynchronize();
-  copy_cuda_lock_arrays_to_device();
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
   check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
   init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
   auto error_sync2 = cudaDeviceSynchronize();
@@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() {
   CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
   CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
 #ifdef __CUDACC_RDC__
-  copy_cuda_lock_arrays_to_device();
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
 #endif
 }
@@ -190,6 +190,20 @@ namespace LAMMPS_NS {
                       Tp_BIAS,Tp_RMASS,Tp_ZERO>(i);
   }
 
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type &update) {
+    update.fx = 0.0;
+    update.fy = 0.0;
+    update.fz = 0.0;
+  }
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type &update,
+                   const value_type &source) {
+    update.fx += source.fx;
+    update.fy += source.fy;
+    update.fz += source.fz;
+  }
+
   KOKKOS_INLINE_FUNCTION
   static void init(volatile value_type &update) {
     update.fx = 0.0;
@@ -233,6 +247,15 @@ namespace LAMMPS_NS {
     energy += c.compute_energy_item(i);
   }
   KOKKOS_INLINE_FUNCTION
+  static void init(value_type &update) {
+    update = 0.0;
+  }
+  KOKKOS_INLINE_FUNCTION
+  static void join(value_type &update,
+                   const value_type &source) {
+    update += source;
+  }
+  KOKKOS_INLINE_FUNCTION
   static void init(volatile value_type &update) {
     update = 0.0;
   }
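The overloads added here are the existing volatile init/join pair minus the volatile qualifier: Kokkos 3.7 drives reductions through non-volatile references, so functors supplying their own identity and combine step need both forms during the transition. A minimal custom-reduction functor in the same style; the value type and per-item contributions are illustrative:

#include <Kokkos_Core.hpp>

struct ForceSum {
  double fx, fy, fz;
};

struct SumForces {
  using value_type = ForceSum;

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, value_type &update) const {
    update.fx += 1.0;  // stand-in per-item contributions
    update.fy += 2.0;
    update.fz += 3.0;
    (void)i;
  }
  // Non-volatile overloads used by the Kokkos 3.7 reduction path.
  KOKKOS_INLINE_FUNCTION
  static void init(value_type &update) { update = {0.0, 0.0, 0.0}; }
  KOKKOS_INLINE_FUNCTION
  static void join(value_type &update, const value_type &source) {
    update.fx += source.fx;
    update.fy += source.fy;
    update.fz += source.fz;
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    ForceSum total{};
    Kokkos::parallel_reduce("forces", 100, SumForces{}, total);
  }
  Kokkos::finalize();
  return 0;
}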
@@ -77,7 +77,6 @@ GPU_AWARE_UNKNOWN
 
 using namespace LAMMPS_NS;
 
-Kokkos::InitArguments KokkosLMP::args{-1, -1, -1, false};
 int KokkosLMP::is_finalized = 0;
 int KokkosLMP::init_ngpus = 0;
 
@@ -110,7 +109,6 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   ngpus = 0;
   int device = 0;
   nthreads = 1;
-  numa = 1;
 
   int iarg = 0;
   while (iarg < narg) {
@@ -189,30 +187,24 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 
       iarg += 2;
 
-    } else if (strcmp(arg[iarg],"n") == 0 ||
-               strcmp(arg[iarg],"numa") == 0) {
-      numa = utils::inumeric(FLERR, arg[iarg+1], false, lmp);
-      iarg += 2;
-
     } else error->all(FLERR,"Invalid Kokkos command-line arg: {}", arg[iarg]);
   }
 
   // Initialize Kokkos. However, we cannot change any
   // Kokkos library parameters after the first initalization
 
-  if (args.num_threads != -1) {
-    if ((args.num_threads != nthreads) || (args.num_numa != numa) || (args.device_id != device))
+  Kokkos::InitializationSettings args;
+
+  if (args.has_num_threads()) {
+    if ((args.get_num_threads() != nthreads) || (args.get_device_id() != device))
       if (me == 0)
-        error->warning(FLERR,"Kokkos package already initalized, "
-                       "cannot reinitialize with different parameters");
-    nthreads = args.num_threads;
-    numa = args.num_numa;
-    device = args.device_id;
+        error->warning(FLERR,"Kokkos package already initalized. Cannot change parameters");
+    nthreads = args.get_num_threads();
+    device = args.get_device_id();
     ngpus = init_ngpus;
   } else {
-    args.num_threads = nthreads;
-    args.num_numa = numa;
-    args.device_id = device;
+    args.set_num_threads(nthreads);
+    args.set_device_id(device);
     init_ngpus = ngpus;
   }
 
@@ -350,7 +342,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 
 /* ---------------------------------------------------------------------- */
 
-void KokkosLMP::initialize(Kokkos::InitArguments args, Error *error)
+void KokkosLMP::initialize(Kokkos::InitializationSettings args, Error *error)
 {
   if (!Kokkos::is_initialized()) {
     if (is_finalized)
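This rewrite tracks the Kokkos 3.7 move from the Kokkos::InitArguments aggregate, with its -1 sentinels and now-removed num_numa field, to Kokkos::InitializationSettings and its has_/get_/set_ accessors. A minimal sketch of the new idiom:

#include <Kokkos_Core.hpp>

int main() {
  // Unset fields are genuinely absent rather than -1 sentinels.
  Kokkos::InitializationSettings settings;
  settings.set_num_threads(4).set_device_id(0);

  if (settings.has_num_threads()) {
    // get_num_threads() is only meaningful once has_num_threads() is true.
    Kokkos::initialize(settings);
    Kokkos::finalize();
  }
  return 0;
}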
@@ -43,7 +43,6 @@ class KokkosLMP : protected Pointers {
   int forward_fix_comm_changed;
   int reverse_comm_changed;
   int nthreads,ngpus;
-  int numa;
   int auto_sync;
   int gpu_aware_flag;
   int neigh_thread;
@@ -53,12 +52,11 @@ class KokkosLMP : protected Pointers {
   double binsize;
 
   static int is_finalized;
-  static Kokkos::InitArguments args;
   static int init_ngpus;
 
   KokkosLMP(class LAMMPS *, int, char **);
 
-  static void initialize(Kokkos::InitArguments, Error *);
+  static void initialize(Kokkos::InitializationSettings, Error *);
   static void finalize();
   void accelerator(int, char **);
   int neigh_count(int);
@@ -524,6 +524,12 @@ struct PairReaxKokkosFindBondFunctor {
   PairReaxFFKokkos<DeviceType> c;
   PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr):c(*c_ptr) {};
 
+  KOKKOS_INLINE_FUNCTION
+  void join(int &dst,
+            const int &src) const {
+    dst = MAX(dst,src);
+  }
+
   KOKKOS_INLINE_FUNCTION
   void join(volatile int &dst,
             const volatile int &src) const {
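The added non-volatile join combines with MAX, i.e. this functor performs a max-reduction across items. The built-in Kokkos::Max reducer expresses the same combine step without a hand-written functor; a small sketch with an illustrative per-item value:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    int max_val = 0;
    // Equivalent of join(dst, src) { dst = MAX(dst, src); } via a reducer.
    Kokkos::parallel_reduce(
        "max_item", 1000,
        KOKKOS_LAMBDA(const int i, int &dst) {
          const int candidate = (i * 37) % 101;  // stand-in per-item value
          if (candidate > dst) dst = candidate;
        },
        Kokkos::Max<int>(max_val));
  }
  Kokkos::finalize();
  return 0;
}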
@@ -52,7 +52,6 @@ class KokkosLMP {
   int kokkos_exists;
   int nthreads;
   int ngpus;
-  int numa;
 
   KokkosLMP(class LAMMPS *, int, char **) { kokkos_exists = 0; }
   ~KokkosLMP() {}
@@ -91,7 +91,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
   nthreads = 1;
 #ifdef _OPENMP
   if (lmp->kokkos) {
-    nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
+    nthreads = lmp->kokkos->nthreads;
   } else if (getenv("OMP_NUM_THREADS") == nullptr) {
     nthreads = 1;
     if (me == 0)