Merge pull request #3532 from stanmoore1/kk_occupancy

Update Kokkos version in LAMMPS to 3.7.1
Authored by Axel Kohlmeyer on 2023-01-20 17:52:05 -05:00; committed by GitHub.
67 changed files with 2295 additions and 561 deletions

View File

@ -49,8 +49,8 @@ if(DOWNLOAD_KOKKOS)
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
include(ExternalProject)
set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.00.tar.gz" CACHE STRING "URL for KOKKOS tarball")
set(KOKKOS_MD5 "84991eca9f066383abe119a5bc7a11c4" CACHE STRING "MD5 checksum of KOKKOS tarball")
set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
set(KOKKOS_MD5 "f140e02b826223b1045207d9bc10d404" CACHE STRING "MD5 checksum of KOKKOS tarball")
mark_as_advanced(KOKKOS_URL)
mark_as_advanced(KOKKOS_MD5)
ExternalProject_Add(kokkos_build
@ -74,7 +74,7 @@ if(DOWNLOAD_KOKKOS)
add_dependencies(LAMMPS::KOKKOSCORE kokkos_build)
add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build)
elseif(EXTERNAL_KOKKOS)
find_package(Kokkos 3.7.00 REQUIRED CONFIG)
find_package(Kokkos 3.7.01 REQUIRED CONFIG)
target_link_libraries(lammps PRIVATE Kokkos::kokkos)
target_link_libraries(lmp PRIVATE Kokkos::kokkos)
else()

View File

@ -105,13 +105,12 @@ Either the full word or an abbreviation can be used for the keywords.
Note that the keywords do not use a leading minus sign. I.e. the
keyword is "t", not "-t". Also note that each of the keywords has a
default setting. Examples of when to use these options and what
settings to use on different platforms is given on the :doc:`KOKKOS package <Speed_kokkos>`
doc page.
settings to use on different platforms are given on the :doc:`KOKKOS
package <Speed_kokkos>` doc page.
* d or device
* g or gpus
* t or threads
* n or numa
.. parsed-literal::
@ -164,19 +163,10 @@ the number of physical cores per node, to use your available hardware
optimally. This also sets the number of threads used by the host when
LAMMPS is compiled with CUDA=yes.
.. parsed-literal::
.. deprecated:: 22Dec2022
numa Nm
This option is only relevant when using pthreads with hwloc support.
In this case Nm defines the number of NUMA regions (typically sockets)
on a node which will be utilized by a single MPI rank. By default Nm
= 1. If this option is used the total number of worker-threads per
MPI rank is threads\*numa. Currently it is almost always better to
assign at least one MPI rank per NUMA region, and leave numa set to
its default value of 1. This is because letting a single process span
multiple NUMA regions induces a significant amount of cross NUMA data
traffic which is slow.
Support for the "numa" or "n" option was removed because its
functionality had already been ignored by Kokkos for some time.
----------

View File

@ -1,5 +1,27 @@
# Change Log
## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)
### Bug Fixes:
- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547)
- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552)
- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521)
- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553)
- Fix warning calling a `__host__` function from a `__host__ __device__` from `View:: as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591)
- OpenMPTarget: adding implementation to set device id. [\#5557](https://github.com/kokkos/kokkos/pull/5557)
- Use `Kokkos::atomic_load` to Correct Race Condition Giving Rise to Seg Faulting Error in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559)
- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561)
- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565)
- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566)
- Do not add -cuda to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569)
- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571)
- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582)
- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573)
- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592)
- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607)
- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622)
## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
@ -102,7 +124,6 @@
- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059)
- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)

View File

@ -129,7 +129,7 @@ ENDIF()
set(Kokkos_VERSION_MAJOR 3)
set(Kokkos_VERSION_MINOR 7)
set(Kokkos_VERSION_PATCH 00)
set(Kokkos_VERSION_PATCH 01)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
@ -152,6 +152,7 @@ ENDIF()
# but scoping issues can make it difficult
GLOBAL_SET(KOKKOS_COMPILE_OPTIONS)
GLOBAL_SET(KOKKOS_LINK_OPTIONS)
GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS)
GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS)
@ -228,6 +229,9 @@ IF (KOKKOS_HAS_TRILINOS)
# we have to match the annoying behavior, also we have to preserve quotes
# which needs another workaround.
SET(KOKKOS_COMPILE_OPTIONS_TMP)
IF (KOKKOS_ENABLE_HIP)
LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS})
ENDIF()
FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
IF(OPTION_HAS_WHITESPACE EQUAL -1)

View File

@ -12,7 +12,7 @@ endif
KOKKOS_VERSION_MAJOR = 3
KOKKOS_VERSION_MINOR = 7
KOKKOS_VERSION_PATCH = 00
KOKKOS_VERSION_PATCH = 01
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@ -505,10 +507,6 @@ KOKKOS_LINK_FLAGS =
KOKKOS_SRC =
KOKKOS_HEADERS =
#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
KOKKOS_LIBS += -latomic
#endif
# Generating the KokkosCore_config.h file.
KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@ -550,6 +548,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
KOKKOS_LIBS += -latomic
tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
@ -1197,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -265,8 +265,8 @@ class BinSort {
//----------------------------------------
// Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed
template <class ExecutionSpace = exec_space>
void create_permute_vector(const ExecutionSpace& exec = exec_space{}) {
template <class ExecutionSpace>
void create_permute_vector(const ExecutionSpace& exec) {
static_assert(
Kokkos::SpaceAccessibility<ExecutionSpace,
typename Space::memory_space>::accessible,
@ -297,6 +297,15 @@ class BinSort {
*this);
}
// Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed
void create_permute_vector() {
Kokkos::fence("Kokkos::Binsort::create_permute_vector: before");
exec_space e{};
create_permute_vector(e);
e.fence("Kokkos::Binsort::create_permute_vector: after");
}
// Sort a subset of a view with respect to the first dimension using the
// permutation array
template <class ExecutionSpace, class ValuesViewType>
@ -372,9 +381,10 @@ class BinSort {
template <class ValuesViewType>
void sort(ValuesViewType const& values, int values_range_begin,
int values_range_end) const {
Kokkos::fence("Kokkos::Binsort::sort: before");
exec_space exec;
sort(exec, values, values_range_begin, values_range_end);
exec.fence("Kokkos::Sort: fence after sorting");
exec.fence("Kokkos::BinSort:sort: after");
}
template <class ExecutionSpace, class ValuesViewType>
@ -641,9 +651,10 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
template <class ViewType>
void sort(ViewType const& view) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec;
sort(exec, view);
exec.fence("Kokkos::Sort: fence after sorting");
exec.fence("Kokkos::sort: fence after sorting");
}
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
@ -682,6 +693,7 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
template <class ViewType>
void sort(ViewType view, size_t const begin, size_t const end) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec;
sort(exec, view, begin, end);
exec.fence("Kokkos::Sort: fence after sorting");
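For reference, a minimal sketch (not part of this diff) of the two sorting call paths touched above: the argument-free convenience overloads now fence before and after, while the overloads taking an execution-space instance leave synchronization to the caller.

#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> values("values", 1000);
    Kokkos::deep_copy(values, 1.0);  // placeholder fill

    // Convenience overload: fences before and after (see the fences added
    // above), so the result is ready when the call returns.
    Kokkos::sort(values);

    // Execution-space overload: runs on `exec`; the caller fences before
    // touching the sorted data from the host.
    Kokkos::DefaultExecutionSpace exec;
    Kokkos::sort(exec, values);
    exec.fence("wait for sort");
  }
  Kokkos::finalize();
  return 0;
}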

View File

@ -10,10 +10,12 @@
# Default settings: change those according to your machine. For
# example, you may have have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten
# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc).
# by using the usual arguments (e.g., -arch=sm_80 -ccbin icpc).
# sm_70 is supported by every CUDA version from 9-12 and is thus
# chosen as default
default_arch="sm_35"
#default_arch="sm_50"
default_arch="sm_70"
#default_arch="sm_80"
#
# The default C++ compiler.

View File

@ -66,6 +66,7 @@
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
#cmakedefine KOKKOS_ARCH_ARMV81
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
#cmakedefine KOKKOS_ARCH_A64FX
#cmakedefine KOKKOS_ARCH_AMD_AVX2
#cmakedefine KOKKOS_ARCH_AVX
#cmakedefine KOKKOS_ARCH_AVX2
@ -101,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3

View File

@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;

View File

@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@ -187,7 +188,9 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
SET(CUDA_ARCH_FLAG "-gpu")
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda)
GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled
GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
ENDIF()
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
SET(CUDA_ARCH_FLAG "-arch")
ENDIF()
@ -542,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@ -804,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()
IF (KOKKOS_ARCH_HOPPER90)
SET(KOKKOS_ARCH_HOPPER ON)
ENDIF()
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -1701,7 +1701,11 @@ namespace Impl {
underlying memory, to facilitate implementation of deep_copy() and
other routines that are defined on View */
template <unsigned N, typename T, typename... Args>
KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView<T, Args...> v) {
KOKKOS_FUNCTION auto as_view_of_rank_n(
DynRankView<T, Args...> v,
typename std::enable_if<std::is_same<
typename ViewTraits<T, Args...>::specialize, void>::value>::type* =
nullptr) {
if (v.rank() != N) {
KOKKOS_IF_ON_HOST(
const std::string message =
@ -2114,9 +2118,10 @@ inline auto create_mirror(
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename DynRankView<T, P...>::data_type,
typename DynRankView<T, P...>::HostMirror::data_type>::value,
@ -2128,12 +2133,13 @@ create_mirror_view(const DynRankView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename DynRankView<T, P...>::data_type,
typename DynRankView<T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename DynRankView<T, P...>::data_type,
typename DynRankView<T, P...>::HostMirror::data_type>::value),
typename DynRankView<T, P...>::HostMirror>
create_mirror_view(
const DynRankView<T, P...>& src,
@ -2141,29 +2147,39 @@ create_mirror_view(
return Kokkos::Impl::create_mirror(src, arg_prop);
}
template <class Space, class T, class... P, class... ViewCtorArgs>
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t<
Kokkos::is_space<Space>::value &&
Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&, const Kokkos::DynRankView<T, P...>& src,
Kokkos::is_space<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view(const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class Space, class T, class... P, class... ViewCtorArgs>
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t<
Kokkos::is_space<Space>::value &&
!Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
Kokkos::is_space<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
!Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view(
const Space&, const Kokkos::DynRankView<T, P...>& src,
const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space;
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);
return Kokkos::Impl::create_mirror(src, prop_copy);
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl
@ -2224,9 +2240,10 @@ create_mirror_view(
template <class Space, class T, class... P>
inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi,
const Space& space,
const Space&,
const Kokkos::DynRankView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
}
template <class T, class... P, class... ViewCtorArgs>
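As a usage note, here is a minimal sketch (assuming Kokkos is already initialized; the names `drv`, `h_mirror`, and `d_mirror` are illustrative) of the dispatch change made above: the Impl overloads now select on whether the view_alloc() property list carries a memory space, rather than on a separate Space argument, matching the calls exercised in TestCreateMirror.cpp further down.

#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>

void mirror_example() {
  using DeviceMemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
  Kokkos::DynRankView<double, Kokkos::DefaultExecutionSpace> drv("drv", 10, 10);

  // No memory space in the property list: falls back to the HostMirror path.
  auto h_mirror = Kokkos::create_mirror_view(
      Kokkos::view_alloc(Kokkos::WithoutInitializing), drv);

  // Memory space passed through view_alloc(): the new overloads above pick
  // the mirror type for that space (and return `drv` itself when the memory
  // spaces match).
  auto d_mirror = Kokkos::create_mirror_view(
      Kokkos::view_alloc(DeviceMemorySpace{}, Kokkos::WithoutInitializing), drv);
  (void)h_mirror;
  (void)d_mirror;
}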

View File

@ -710,7 +710,7 @@ template <class Space, class T, class... P>
inline auto create_mirror(
const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror(
src, Impl::ViewCtorProp<>{typename Space::memory_space{}});
src, Kokkos::view_alloc(typename Space::memory_space{}));
}
template <class Space, class T, class... P>
@ -729,48 +729,68 @@ inline auto create_mirror(
}
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
create_mirror_view(
const typename Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::create_mirror(arg_prop, src);
}
template <class Space, class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
Impl::MirrorDynamicViewType<Space, T, P...>::is_same_memspace,
typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&,
const Kokkos::Experimental::DynamicView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl
// Create a mirror view in host space
@ -790,8 +810,9 @@ inline auto create_mirror_view(
// Create a mirror in a new space
template <class Space, class T, class... P>
inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror_view(src,
view_alloc(typename Space::memory_space{}));
}
template <class Space, class T, class... P>

View File

@ -1901,19 +1901,22 @@ struct MirrorOffsetType {
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror
inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror(
Kokkos::create_mirror(arg_prop, src.view()), src.begins());
}
template <class Space, class T, class... P, class... ViewCtorArgs>
inline typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
create_mirror(const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline auto create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space;
static_assert(
!alloc_prop_input::has_label,
@ -1923,10 +1926,6 @@ create_mirror(const Space&,
!alloc_prop_input::has_pointer,
"The view constructor arguments passed to Kokkos::create_mirror must "
"not include a pointer!");
static_assert(
!alloc_prop_input::has_memory_space,
"The view constructor arguments passed to Kokkos::create_mirror must "
"not include a memory space instance!");
static_assert(
!alloc_prop_input::allow_padding,
"The view constructor arguments passed to Kokkos::create_mirror must "
@ -1962,15 +1961,17 @@ inline auto create_mirror(
template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{});
const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}));
}
template <class Space, class T, class... P>
typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Kokkos::view_alloc(wi));
return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
}
template <class T, class... P, class... ViewCtorArgs>
@ -1983,54 +1984,64 @@ inline auto create_mirror(
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror_view(
const typename Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::create_mirror(arg_prop, src);
}
template <class Space, class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
Kokkos::Experimental::OffsetView<T, P...>>
create_mirror_view(const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class Space, class T, class... P, class... ViewCtorArgs>
std::enable_if_t<
!Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type>
create_mirror_view(const Space& space,
const Kokkos::Experimental::OffsetView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return create_mirror(space, src, arg_prop);
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl
@ -2052,15 +2063,17 @@ inline auto create_mirror_view(
template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}));
}
template <class Space, class T, class... P>
inline auto create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
}
template <class T, class... P, class... ViewCtorArgs>

View File

@ -46,3 +46,13 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources})
endif()
endforeach()
SET(COMPILE_ONLY_SOURCES
TestCreateMirror.cpp
)
KOKKOS_ADD_EXECUTABLE(
TestCompileOnly
SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES}
)

View File

@ -0,0 +1 @@
int main() {}

View File

@ -0,0 +1,179 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <Kokkos_DynamicView.hpp>
#include <Kokkos_DynRankView.hpp>
#include <Kokkos_OffsetView.hpp>
template <typename TestView, typename MemorySpace>
void check_memory_space(TestView, MemorySpace) {
static_assert(
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
}
template <class View>
auto host_mirror_test_space(View) {
return std::conditional_t<
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
typename View::memory_space>::accessible,
typename View::memory_space, Kokkos::HostSpace>{};
}
template <typename View>
void test_create_mirror_properties(const View& view) {
using namespace Kokkos;
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
// clang-format off
// create_mirror
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror( view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror_view
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror view_alloc
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc + execution space
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
// clang-format on
}
void test_create_mirror_dynrankview() {
Kokkos::DynRankView<int, Kokkos::DefaultExecutionSpace> device_view(
"device view", 10);
Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
void test_create_mirror_offsetview() {
Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace>
device_view("device view", {0, 10});
Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
"host view", {0, 10});
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
void test_create_mirror_dynamicview() {
Kokkos::Experimental::DynamicView<int*, Kokkos::DefaultExecutionSpace>
device_view("device view", 2, 10);
Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
"host view", 2, 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -53,13 +53,69 @@
namespace Kokkos {
namespace Impl {
inline int cuda_warp_per_sm_allocation_granularity(
cudaDeviceProp const& properties) {
// Allocation granularity of warps in each sm
switch (properties.major) {
case 3:
case 5:
case 7:
case 8:
case 9: return 4;
case 6: return (properties.minor == 0 ? 2 : 4);
default:
throw_runtime_exception(
"Unknown device in cuda warp per sm allocation granularity");
return 0;
}
}
inline int cuda_max_warps_per_sm_registers(
cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
// Maximum number of warps per sm as a function of register counts,
// subject to the constraint that warps are allocated with a fixed granularity
int const max_regs_per_block = properties.regsPerBlock;
int const regs_per_warp = attributes.numRegs * properties.warpSize;
int const warp_granularity =
cuda_warp_per_sm_allocation_granularity(properties);
// The granularity of register allocation is chunks of 256 registers per warp,
// which implies a need to over-allocate, so we round up
int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);
// The maximum number of warps per SM is constrained from above by register
// allocation. To satisfy the constraint that warps per SM is allocated at a
// finite granularity, we need to round down.
int const max_warps_per_sm =
warp_granularity *
(max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
return max_warps_per_sm;
}
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
// Limits due do registers/SM
// Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
// Compute the maximum number of warps as a function of the number of
// registers
int const max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(properties, attributes);
// Constrain the number of blocks to respect the maximum number of warps per
// SM. On face value this should be an equality, but due to the warp
// granularity constraints noted in `cuda_max_warps_per_sm_registers` the
// left-hand side of this comparison can overshoot what the hardware allows
// based on register counts alone.
while ((max_blocks_regs * block_size / properties.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;
// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@ -203,40 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these numbers can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
// bytes is then to be found below in the summary.
// * Another option would be to look for the information in the "Tuning
// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
// the "Shared Memory" section (more tedious)
inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
int const compute_capability = properties.major * 10 + properties.minor;
return [compute_capability]() {
switch (compute_capability) {
case 30:
case 32:
case 35: return 16;
case 37: return 80;
case 50:
case 53:
case 60:
case 62: return 64;
case 52:
case 61: return 96;
case 70:
case 80:
case 86: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
"Unknown device in cuda block size deduction");
}
return 0;
}() * 1024;
template <class LaunchBounds>
int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
LaunchBounds) {
auto const& prop = Kokkos::Cuda().cuda_device_prop();
// Thin version of cuda_get_opt_block_size for cases where there is no shared
// memory
auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
LaunchBounds{});
}
} // namespace Impl
} // namespace Kokkos
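To make the register arithmetic above concrete, here is a standalone sketch that reproduces the formula of cuda_max_warps_per_sm_registers() with illustrative numbers; the hardware constants (a 64K-register block limit, warp size 32, a granularity of 4 warps) and the 37 registers per thread are assumptions for the example, not values queried from a real device.

#include <cstdio>

int main() {
  const int regs_per_block_limit = 65536;  // assumed properties.regsPerBlock
  const int warp_size            = 32;     // assumed properties.warpSize
  const int warp_granularity     = 4;      // e.g. the value returned above for major == 7, 8, or 9
  const int regs_per_thread      = 37;     // assumed attributes.numRegs for some kernel

  // Registers are allocated in chunks of 256 per warp, so round up.
  const int regs_per_warp           = regs_per_thread * warp_size;              // 1184
  const int allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);  // 1280

  // Warps are handed out with a fixed granularity, so round the count down.
  const int max_warps_per_sm =
      warp_granularity *
      (regs_per_block_limit / (allocated_regs_per_warp * warp_granularity));    // 4 * 12 = 48

  std::printf("register-limited warps per SM: %d\n", max_warps_per_sm);
  return 0;
}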

View File

@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \
((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION

View File

@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif
#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
#else
cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
#endif
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();

View File

@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@ -158,63 +154,119 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}
// This function needs to be template on DriverType and LaunchBounds
// These functions need to be templated on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(KernelFuncPtr const& func,
bool prefer_shmem) {
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
auto set_cache_config = [&] {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
func,
(prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
return prefer_shmem;
const cudaFuncAttributes& get_cuda_kernel_func_attributes(
const KernelFuncPtr& func) {
// Only call cudaFuncGetAttributes once for each unique kernel
// by leveraging static variable initialization rules
auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
cudaFuncAttributes attr;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
return attr;
};
static bool cache_config_preference_cached = set_cache_config();
if (cache_config_preference_cached != prefer_shmem) {
static cudaFuncAttributes func_attr = wrap_get_attributes();
return func_attr;
}
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(const KernelFuncPtr& func,
const cudaDeviceProp& device_props,
const size_t block_size, int& shmem,
const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
const auto& func_attr =
get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
// Compute limits for number of blocks due to registers/SM
const size_t regs_per_sm = device_props.regsPerMultiprocessor;
const size_t regs_per_thread = func_attr.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
// Compute the maximum number of warps as a function of the number of
// registers
const size_t max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(device_props, func_attr);
// Constrain the number of blocks to respect the maximum number of warps per
// SM. On face value this should be an equality, but due to the warp
// granularity constraints noted in `cuda_max_warps_per_sm_registers` the
// left-hand side of this comparison can overshoot what the hardware allows
// based on register counts alone.
while ((max_blocks_regs * block_size / device_props.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
const size_t num_threads_desired =
((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
// Get close to the desired occupancy:
// don't undershoot by much, but also don't allocate a whole extra block
// just because the request is a few threads over.
size_t num_blocks_desired =
(num_threads_desired + block_size * 0.8) / block_size;
num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
if (num_blocks_desired == 0) num_blocks_desired = 1;
// Calculate how much shared memory we need per block
size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
// The minimum shared memory allocation we can have in total per SM is 8kB.
// If we want to lower occupancy we have to make sure we request at least that
// much in aggregate over all blocks, so that shared memory actually becomes a
// limiting factor for occupancy
constexpr size_t min_shmem_size_per_sm = 8192;
if ((occupancy < 100) &&
(shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
// Need to set the caller's shmem variable so that the
// kernel launch uses the correct dynamic shared memory request
shmem = shmem_per_block - func_attr.sharedSizeBytes;
}
// Compute the carveout fraction we need based on occupancy
// Use multiples of 8kB
const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
size_t carveout = shmem_per_block == 0
? 0
: 100 *
(((num_blocks_desired * shmem_per_block +
min_shmem_size_per_sm - 1) /
min_shmem_size_per_sm) *
min_shmem_size_per_sm) /
max_shmem_per_sm;
if (carveout > 100) carveout = 100;
// Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
return carveout;
};
// Store the value in a static variable so we only reset if needed
static size_t cache_config_preference_cached = set_cache_config();
if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
(void)prefer_shmem;
(void)device_props;
(void)block_size;
(void)occupancy;
#endif
}
template <class Policy>
std::enable_if_t<Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const& policy, cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
bool& prefer_shmem) {
int const block_size = block.x * block.y * block.z;
int const desired_occupancy = policy.impl_get_desired_occupancy().value();
size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
size_t const static_shmem = attributes.sharedSizeBytes;
// round to nearest integer and avoid division by zero
int active_blocks = std::max(
1, static_cast<int>(std::round(
static_cast<double>(properties.maxThreadsPerMultiProcessor) /
block_size * desired_occupancy / 100)));
int const dynamic_shmem =
shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
if (dynamic_shmem > shmem) {
shmem = dynamic_shmem;
prefer_shmem = false;
}
}
template <class Policy>
std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
@ -348,7 +400,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) {
CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@ -358,8 +410,19 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
void const* args[] = {&driver};
@ -442,7 +505,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) {
CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@ -452,8 +515,18 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@ -566,7 +639,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) {
CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@ -580,7 +653,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
driver, grid, block, shmem, cuda_instance, prefer_shmem);
driver, grid, block, shmem, cuda_instance);
}
#endif
};
@ -613,8 +686,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
const CudaInternal* cuda_instance,
bool prefer_shmem) {
const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@ -623,20 +695,22 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem);
// If a desired occupancy is specified, we compute how much shared memory
// to ask for to achieve that occupancy, assuming that the cache
// configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
// shared memory computed is actually smaller than `shmem` we overwrite
// `shmem` and set `prefer_shmem` to `false`.
modify_launch_configuration_if_desired_occupancy_is_specified(
driver.get_policy(), cuda_instance->m_deviceProp,
get_cuda_func_attributes(), block, shmem, prefer_shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
Impl::configure_shmem_preference<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), prefer_shmem);
ensure_cuda_lock_arrays_on_device();
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
@ -650,18 +724,9 @@ struct CudaParallelLaunchImpl<
}
static cudaFuncAttributes get_cuda_func_attributes() {
// Race condition inside of cudaFuncGetAttributes if the same address is
// given requires using a local variable as input instead of a static Rely
// on static variable initialization to make sure only one thread executes
// the code and the result is visible.
auto wrap_get_attributes = []() -> cudaFuncAttributes {
cudaFuncAttributes attr_tmp;
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
return attr_tmp;
};
static cudaFuncAttributes attr = wrap_get_attributes();
return attr;
return get_cuda_kernel_func_attributes<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func());
}
};

View File

@ -79,7 +79,8 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
void initialize_host_cuda_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
desul::Impl::init_lock_arrays();
desul::ensure_cuda_lock_arrays_on_device();
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
#endif
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
KOKKOS_IMPL_CUDA_SAFE_CALL(
@ -88,7 +89,7 @@ void initialize_host_cuda_lock_arrays() {
Impl::cuda_device_synchronize(
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
g_host_cuda_lock_arrays.n = Cuda::concurrency();
copy_cuda_lock_arrays_to_device();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
256>>>();
Impl::cuda_device_synchronize(
@ -105,7 +106,7 @@ void finalize_host_cuda_lock_arrays() {
g_host_cuda_lock_arrays.atomic = nullptr;
g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
copy_cuda_lock_arrays_to_device();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif
}

View File

@ -67,7 +67,7 @@ struct CudaLockArrays {
/// \brief This global variable in Host space is the central definition
/// of these arrays.
extern CudaLockArrays g_host_cuda_lock_arrays;
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;
/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays.
@ -105,12 +105,12 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__constant__ extern
#endif
CudaLockArrays g_device_cuda_lock_arrays;
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
@ -123,7 +123,9 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
return (
0 ==
atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
}
/// \brief Release lock for the address
@ -136,7 +138,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
}
} // namespace Impl
@ -149,49 +151,45 @@ namespace {
static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace
} // namespace Impl
} // namespace Kokkos
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline
#else
static
#endif
void
copy_cuda_lock_arrays_to_device() {
if (lock_array_copied == 0) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
&g_host_cuda_lock_arrays,
sizeof(CudaLockArrays)));
/* Dan Ibanez: it is critical that this code be a macro, so that it will
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
putting this in an inline function will NOT do the right thing! */
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
if (::Kokkos::Impl::lock_array_copied == 0) { \
KOKKOS_IMPL_CUDA_SAFE_CALL( \
cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
&Kokkos::Impl::g_host_cuda_lock_arrays, \
sizeof(Kokkos::Impl::CudaLockArrays))); \
} \
lock_array_copied = 1; \
}
lock_array_copied = 1;
}
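The comment explains why this has to be a macro: without relocatable device code, every translation unit that includes the header gets its own instance of the __device__ symbol, so the cudaMemcpyToSymbol call must be stamped out in the translation unit that will launch the kernels. A hedged, self-contained illustration of the pattern (not the Kokkos sources; the names here are invented):

#include <cuda_runtime.h>

__device__ int g_device_table[4];  // one instance per TU when RDC is off

// Expanding this macro in a translation unit makes cudaMemcpyToSymbol bind to
// *that* unit's instance of g_device_table; an inline function defined in
// another unit would bind to the wrong instance.
#define COPY_TABLE_TO_DEVICE(host_ptr)                               \
  do {                                                               \
    cudaMemcpyToSymbol(g_device_table, (host_ptr), sizeof(int) * 4); \
  } while (0)

int main() {
  int host_table[4] = {1, 2, 3, 4};
  COPY_TABLE_TO_DEVICE(host_table);  // expands here, in this TU
  return 0;
}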
#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
inline static void ensure_cuda_lock_arrays_on_device() {
copy_cuda_lock_arrays_to_device();
}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
#endif
#else
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
inline static void ensure_cuda_lock_arrays_on_device() {
copy_cuda_lock_arrays_to_device();
desul::ensure_cuda_lock_arrays_on_device();
}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#endif
#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
} // namespace Impl
} // namespace Kokkos
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */

View File

@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {
template <typename ParallelType, typename Policy, typename LaunchBounds>
int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelType,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const optimal_block_size =
Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
// Compute how many blocks of this size we can launch, based on warp
// constraints
int const max_warps_per_sm_registers =
Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
int const max_num_threads_from_warps =
max_warps_per_sm_registers * prop.warpSize;
int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
// Compute the total number of threads
int const max_threads_per_sm = optimal_block_size * max_num_blocks;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
}
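The new helper folds the register- and warp-based limits into one place. A standalone sketch of the same arithmetic with illustrative numbers (the real code pulls these values from cudaFuncAttributes and cudaDeviceProp; the function name below is made up):

#include <algorithm>
#include <cstdio>

int max_tile_size_product_sketch(int optimal_block_size,  // from register use
                                 int max_warps_per_sm,    // register-limited
                                 int warp_size,
                                 int max_hierarchical_parallelism) {
  int const max_threads_from_warps = max_warps_per_sm * warp_size;
  int const max_blocks_per_sm = max_threads_from_warps / optimal_block_size;
  int const max_threads_per_sm = optimal_block_size * max_blocks_per_sm;
  return std::min(max_threads_per_sm, max_hierarchical_parallelism);
}

int main() {
  // e.g. 256-thread blocks, 48 register-limited warps of 32 threads,
  // and a cap of 1048576 on hierarchical parallelism
  std::printf("%d\n", max_tile_size_product_sketch(256, 48, 32, 1048576));
}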
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelFor,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@ -121,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@ -139,8 +155,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@ -158,8 +173,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@ -175,8 +189,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@ -191,8 +204,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@ -263,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelReduce,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {
@ -405,8 +407,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {

View File

@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@ -465,8 +464,24 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
public:
using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type;
using value_type = typename Analysis::value_type;
using functor_type = FunctorType;
using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
private:
// Algorithmic constraints:
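The word_size_type alias introduced above picks a narrower word when the scan's value_type is smaller than the default 4-byte size_type. A quick standalone check of how that std::conditional_t selection behaves (a sketch with a stand-in 4-byte size_type; the alias name is invented):

#include <cstdint>
#include <type_traits>

template <class ValueType, class SizeType = std::uint32_t>
using word_size_type_sketch = std::conditional_t<
    sizeof(ValueType) < sizeof(SizeType),
    std::conditional_t<sizeof(ValueType) == 2, std::int16_t, std::int8_t>,
    SizeType>;

// 1-byte scalars scan in 1-byte words, 2-byte scalars in 2-byte words,
// and anything >= 4 bytes keeps using size_type words as before.
static_assert(std::is_same<word_size_type_sketch<std::int8_t>, std::int8_t>::value, "");
static_assert(std::is_same<word_size_type_sketch<std::int16_t>, std::int16_t>::value, "");
static_assert(std::is_same<word_size_type_sketch<double>, std::uint32_t>::value, "");

int main() {}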
@ -477,7 +492,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
const FunctorType m_functor;
const Policy m_policy;
size_type* m_scratch_space;
word_size_type* m_scratch_space;
size_type* m_scratch_flags;
size_type m_final;
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@ -501,12 +516,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() +
word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y;
final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@ -532,7 +547,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// gridDim.x
cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags);
}
@ -541,21 +556,22 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
size_type* const shared_prefix =
word_size_type* const shared_data =
kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum =
word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total.
if (blockIdx.x) {
size_type* const block_total =
word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -602,7 +618,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
typename Analysis::pointer_type(shared_data + word_count.value));
{
size_type* const block_total =
word_size_type* const block_total =
shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -690,8 +706,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// How many blocks are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x);
m_scratch_space =
reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
@ -708,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
}
}
@ -752,10 +769,26 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
Policy, FunctorType>;
public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType;
using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
private:
// Algorithmic constraints:
@ -766,7 +799,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
const FunctorType m_functor;
const Policy m_policy;
size_type* m_scratch_space;
word_size_type* m_scratch_space;
size_type* m_scratch_flags;
size_type m_final;
ReturnType& m_returnvalue;
@ -791,12 +824,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() +
word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y;
final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@ -822,7 +855,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// gridDim.x
cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags);
}
@ -831,21 +864,22 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
size_type* const shared_prefix =
word_size_type* const shared_data =
kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum =
word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total.
if (blockIdx.x) {
size_type* const block_total =
word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -894,7 +928,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
typename Analysis::pointer_type(shared_data + word_count.value));
{
size_type* const block_total =
word_size_type* const block_total =
shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -983,8 +1017,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// How many blocks are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x);
m_scratch_space =
reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
@ -1002,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@ -1022,7 +1057,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
#endif
DeepCopy<HostSpace, CudaSpace, Cuda>(
m_policy.space(), &m_returnvalue,
m_scratch_space + (grid_x - 1) * size / sizeof(int), size);
m_scratch_space + (grid_x - 1) * size / sizeof(word_size_type),
size);
}
}
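Together with the scratch-space casts above, these changes let ParallelScan run on value types narrower than 4 bytes. A hedged usage sketch (assumes an installed Kokkos; not taken from the test suite verbatim):

#include <Kokkos_Core.hpp>
#include <cstdint>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    std::int16_t total = 0;
    Kokkos::parallel_scan(
        "small_scan", Kokkos::RangePolicy<>(0, 100),
        KOKKOS_LAMBDA(int i, std::int16_t& update, bool final_pass) {
          (void)i;
          (void)final_pass;
          update += std::int16_t(1);  // each iteration contributes 1
        },
        total);
    std::printf("total = %d\n", int(total));  // expect 100
  }
  Kokkos::finalize();
}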

View File

@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(),
true); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(),
true); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
m_policy.space().fence(

View File

@ -116,6 +116,7 @@ __device__ inline void cuda_inter_warp_reduction(
value = result[0];
for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++)
reducer.join(&value, &result[i]);
__syncthreads();
}
template <class ValueType, class ReducerType>
@ -427,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024

View File

@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>(
*this, grid, block, shared, Cuda().impl_internal_space_instance(),
false);
*this, grid, block, shared, Cuda().impl_internal_space_instance());
}
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

View File

@ -448,11 +448,27 @@ class ParallelScanHIPBase {
Policy, FunctorType>;
public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType;
using size_type = Kokkos::Experimental::HIP::size_type;
using index_type = typename Policy::index_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::HIP::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
protected:
// Algorithmic constraints:
@ -463,10 +479,10 @@ class ParallelScanHIPBase {
const FunctorType m_functor;
const Policy m_policy;
size_type* m_scratch_space = nullptr;
size_type* m_scratch_flags = nullptr;
size_type m_final = false;
int m_grid_x = 0;
word_size_type* m_scratch_space = nullptr;
size_type* m_scratch_flags = nullptr;
size_type m_final = false;
int m_grid_x = 0;
// Only let one ParallelReduce/Scan modify the shared memory. The
// constructor acquires the mutex which is released in the destructor.
std::lock_guard<std::mutex> m_shared_memory_lock;
@ -489,12 +505,12 @@ class ParallelScanHIPBase {
__device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
pointer_type const shared_value = reinterpret_cast<pointer_type>(
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>() +
word_count.value * threadIdx.y);
final_reducer.init(shared_value);
@ -518,7 +534,7 @@ class ParallelScanHIPBase {
// gridDim.x
hip_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x,
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(),
Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>(),
m_scratch_space, m_scratch_flags);
}
@ -527,22 +543,22 @@ class ParallelScanHIPBase {
__device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... }
size_type* const shared_data =
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>();
size_type* const shared_prefix =
word_size_type* const shared_data =
Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum =
word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total.
if (blockIdx.x) {
size_type* const block_total =
word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -588,7 +604,7 @@ class ParallelScanHIPBase {
typename Analysis::pointer_type(shared_data + word_count.value));
{
size_type* const block_total =
word_size_type* const block_total =
shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -647,8 +663,9 @@ class ParallelScanHIPBase {
// How many blocks are really needed for this much work:
m_grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x);
m_scratch_space = reinterpret_cast<word_size_type*>(
Kokkos::Experimental::Impl::hip_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x));
m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
m_policy.space(), sizeof(size_type) * 1);
@ -734,7 +751,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
Kokkos::Experimental::HIP>(
Base::m_policy.space(), &m_returnvalue,
Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int),
Base::m_scratch_space + (Base::m_grid_x - 1) * size /
sizeof(typename Base::word_size_type),
size);
}
}

View File

@ -225,11 +225,11 @@ struct HIPReductionsFunctor<FunctorType, false> {
}
}
template <typename SizeType>
__device__ static inline bool scalar_inter_block_reduction(
FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) {
Scalar* const global_team_buffer_element =
reinterpret_cast<Scalar*>(global_data);
@ -411,16 +411,14 @@ __device__ void hip_intra_block_reduce_scan(
* Global reduce result is in the last threads' 'shared_data' location.
*/
template <bool DoScan, class FunctorType>
template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan_impl(
FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) {
using size_type = ::Kokkos::Experimental::HIP::size_type;
using size_type = SizeType;
using value_type = typename FunctorType::value_type;
using pointer_type = typename FunctorType::pointer_type;
@ -518,13 +516,12 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
return is_last_block;
}
template <bool DoScan, typename FunctorType>
template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan(
FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) {
// If we are doing a reduction and we don't do an array reduction, we use the
// reduction-only path. Otherwise, we use the common path between reduction

View File

@ -116,6 +116,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction(
value = result[0];
for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
reducer.join(&value, &result[i]);
__syncthreads();
}
template <typename ValueType, typename ReducerType>

View File

@ -3711,12 +3711,13 @@ namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
(std::is_same<
typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same<
typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
typename Kokkos::View<T, P...>::HostMirror>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
@ -3725,12 +3726,13 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
typename Kokkos::View<T, P...>::HostMirror>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
@ -3738,25 +3740,33 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
}
// Create a mirror view in a new space (specialization for same space)
template <class Space, class T, class... P, class... ViewCtorArgs>
std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
// Create a mirror view in a new space (specialization for different space)
template <class Space, class T, class... P, class... ViewCtorArgs>
std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space;
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);
return Kokkos::Impl::create_mirror(src, prop_copy);
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl
@ -3815,9 +3825,10 @@ typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, Space const& space,
Kokkos::Impl::WithoutInitializing_t wi, Space const&,
Kokkos::View<T, P...> const& v) {
return Impl::create_mirror_view(space, v, view_alloc(wi));
return Impl::create_mirror_view(
v, view_alloc(typename Space::memory_space{}, wi));
}
template <class T, class... P, class... ViewCtorArgs>
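With the reworked overloads, the destination memory space travels inside the view_alloc property list instead of being passed as a separate Space argument. A hedged usage sketch (mirrors the calls exercised by the new TestCreateMirror.cpp further down):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> v("v", 10);
    // Memory space and WithoutInitializing are both view_alloc properties now.
    auto h = Kokkos::create_mirror_view(
        Kokkos::view_alloc(Kokkos::WithoutInitializing, Kokkos::HostSpace{}), v);
    Kokkos::deep_copy(h, v);
  }
  Kokkos::finalize();
}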

View File

@ -1754,7 +1754,10 @@ struct RankDataType<ValueType, 0> {
};
template <unsigned N, typename... Args>
KOKKOS_FUNCTION std::enable_if_t<N == View<Args...>::Rank, View<Args...>>
KOKKOS_FUNCTION std::enable_if_t<
N == View<Args...>::Rank &&
std::is_same<typename ViewTraits<Args...>::specialize, void>::value,
View<Args...>>
as_view_of_rank_n(View<Args...> v) {
return v;
}
@ -1762,13 +1765,13 @@ as_view_of_rank_n(View<Args...> v) {
// Placeholder implementation to compile generic code for DynRankView; should
// never be called
template <unsigned N, typename T, typename... Args>
std::enable_if_t<
N != View<T, Args...>::Rank,
KOKKOS_FUNCTION std::enable_if_t<
N != View<T, Args...>::Rank &&
std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value,
View<typename RankDataType<typename View<T, Args...>::value_type, N>::type,
Args...>>
as_view_of_rank_n(View<T, Args...>) {
Kokkos::Impl::throw_runtime_exception(
"Trying to get at a View of the wrong rank");
Kokkos::abort("Trying to get at a View of the wrong rank");
return {};
}

View File

@ -101,8 +101,8 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
void push_work(const std::int32_t w) const noexcept {
const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const ready_queue = &m_queue[0];
std::int32_t volatile* const end_hint = &m_queue[2 * N + 1];
std::int32_t* const ready_queue = &m_queue[0];
std::int32_t* const end_hint = &m_queue[2 * N + 1];
// Push work to end of queue
const std::int32_t j = atomic_fetch_add(end_hint, 1);
@ -134,14 +134,14 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
std::int32_t pop_work() const noexcept {
const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const ready_queue = &m_queue[0];
std::int32_t volatile* const begin_hint = &m_queue[2 * N];
std::int32_t* const ready_queue = &m_queue[0];
std::int32_t* const begin_hint = &m_queue[2 * N];
// begin hint is guaranteed to be less than or equal to
// actual begin location in the queue.
for (std::int32_t i = *begin_hint; i < N; ++i) {
const std::int32_t w = ready_queue[i];
for (std::int32_t i = Kokkos::atomic_load(begin_hint); i < N; ++i) {
const std::int32_t w = Kokkos::atomic_load(&ready_queue[i]);
if (w == END_TOKEN) {
return END_TOKEN;
@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const count_queue = &m_queue[N];
std::int32_t* const count_queue = &m_queue[N];
const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w + 1);
@ -199,7 +199,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
KOKKOS_INLINE_FUNCTION
void operator()(const TagCount, int i) const noexcept {
std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()];
std::int32_t* const count_queue = &m_queue[m_graph.numRows()];
atomic_increment(count_queue + m_graph.entries[i]);
}
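The queue accesses switch from volatile pointers to explicit atomic loads. A hedged sketch of that pattern in isolation (illustrative only; the real queue lives inside WorkGraphPolicy, and atomic_store is used here just to give the load something to observe):

#include <Kokkos_Core.hpp>
#include <cstdint>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<std::int32_t*> queue("queue", 4);
    Kokkos::parallel_for(
        "poll_queue", 1, KOKKOS_LAMBDA(int) {
          // was: std::int32_t volatile* q = &queue(0); ... read q[i] ...
          Kokkos::atomic_store(&queue(0), 42);
          std::int32_t const w = Kokkos::atomic_load(&queue(0));
          (void)w;
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
}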

View File

@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_AMPERE)
defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else

View File

@ -47,6 +47,7 @@
#endif
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_DeviceManagement.hpp>
#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)
@ -114,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
@ -164,7 +166,11 @@ void OpenMPTarget::impl_static_fence(const std::string& name) {
name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
}
void OpenMPTarget::impl_initialize(InitializationSettings const&) {
void OpenMPTarget::impl_initialize(InitializationSettings const& settings) {
using Kokkos::Impl::get_gpu;
const int device_num = get_gpu(settings);
omp_set_default_device(device_num);
Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize();
}
void OpenMPTarget::impl_finalize() {

View File

@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
!defined(KOKKOS_ARCH_AMPERE)
!defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;

View File

@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting too many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting too many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

View File

@ -110,10 +110,9 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept {
return ((uint64_t)a) | (((uint64_t)d) << 32);
#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \
defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__)
#elif defined(__powerpc64__) || defined(__ppc64__)
unsigned int cycles = 0;
unsigned long cycles = 0;
asm volatile("mftb %0" : "=r"(cycles));

View File

@ -166,6 +166,8 @@ int get_device_count() {
#elif defined(KOKKOS_ENABLE_OPENACC)
return acc_get_num_devices(
Kokkos::Experimental::Impl::OpenACC_Traits::dev_type);
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
return omp_get_num_devices();
#else
Kokkos::abort("implementation bug");
return -1;
@ -426,11 +428,17 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) {
Kokkos::abort("implementation bug");
}
auto const* local_rank_str =
std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); // OpenMPI
if (!local_rank_str)
local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); // MVAPICH2
if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); // SLURM
char const* local_rank_str = nullptr;
for (char const* env_var : {
"OMPI_COMM_WORLD_LOCAL_RANK", // OpenMPI
"MV2_COMM_WORLD_LOCAL_RANK", // MVAPICH2
"MPI_LOCALRANKID", // MPICH
"SLURM_LOCALID", // SLURM
"PMI_LOCAL_RANK" // PMI
}) {
local_rank_str = std::getenv(env_var);
if (local_rank_str) break;
}
// use first GPU available for execution if unable to detect local MPI rank
if (!local_rank_str) {
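The rank-detection logic now walks a list of launcher-specific environment variables and keeps the first one that is set. A standalone sketch of that loop (the variable names mirror the diff; everything else is illustrative):

#include <cstdio>
#include <cstdlib>
#include <initializer_list>

int main() {
  char const* local_rank_str = nullptr;
  for (char const* env_var : {
           "OMPI_COMM_WORLD_LOCAL_RANK",  // OpenMPI
           "MV2_COMM_WORLD_LOCAL_RANK",   // MVAPICH2
           "MPI_LOCALRANKID",             // MPICH
           "SLURM_LOCALID",               // SLURM
           "PMI_LOCAL_RANK"               // PMI
       }) {
    local_rank_str = std::getenv(env_var);
    if (local_rank_str) break;
  }
  std::printf("local rank: %s\n",
              local_rank_str ? local_rank_str : "<not set>");
}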

View File

@ -1128,9 +1128,8 @@ struct ViewOffset<
KOKKOS_INLINE_FUNCTION constexpr ViewOffset(
const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1),
static_assert(((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1)),
"ViewOffset LayoutLeft and LayoutRight are only compatible "
"when rank <= 1");
}
@ -1778,8 +1777,7 @@ struct ViewOffset<
const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1),
(DimRHS::rank == 1 && dimension_type::rank == 1),
"ViewOffset LayoutRight and LayoutLeft are only compatible "
"when rank <= 1");
}
@ -3059,10 +3057,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
std::is_trivially_copy_assignable<Dummy>::value>
construct_shared_allocation() {
// Shortcut for zero initialization
ValueType value{};
// On A64FX memset seems to do the wrong thing with regards to first touch
// leading to the significant performance issues
#ifndef KOKKOS_ARCH_A64FX
ValueType value{};
if (Impl::is_zero_byte(value)) {
uint64_t kpID = 0;
if (Kokkos::Profiling::profileLibraryLoaded()) {
@ -3539,9 +3537,7 @@ class ViewMapping<
typename SrcTraits::array_layout>::value ||
std::is_same<typename DstTraits::array_layout,
Kokkos::LayoutStride>::value ||
(DstTraits::dimension::rank == 0) ||
(DstTraits::dimension::rank == 1 &&
DstTraits::dimension::rank_dynamic == 1)
(DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1)
};
public:

View File

@ -73,6 +73,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
SET(COMPILE_ONLY_SOURCES
TestArray.cpp
TestCreateMirror.cpp
TestDetectionIdiom.cpp
TestInterOp.cpp
TestLegionInteroperability.cpp
@ -86,6 +87,7 @@ ENDIF()
KOKKOS_ADD_EXECUTABLE(
TestCompileOnly
SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES}
)

View File

@ -0,0 +1 @@
int main() {}

View File

@ -0,0 +1,126 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
template <typename TestView, typename MemorySpace>
void check_memory_space(TestView, MemorySpace) {
static_assert(
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
}
template <class View>
auto host_mirror_test_space(View) {
return std::conditional_t<
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
typename View::memory_space>::accessible,
typename View::memory_space, Kokkos::HostSpace>{};
}
template <typename View>
void test_create_mirror_properties(const View& view) {
using namespace Kokkos;
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
// clang-format off
// create_mirror
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror( view), host_mirror_test_space(view));
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror_view
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror view_alloc
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror view_alloc + execution space
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc + execution space
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc + execution space
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
// clang-format on
}
void test() {
Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view("device view",
10);
Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
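A minimal usage sketch of the host-mirror workflow these overloads support (illustration only, not part of the patch; assumes Kokkos has been initialized):

#include <Kokkos_Core.hpp>

// Fill a device view on the host, then copy the data over. When the view is
// already host-accessible, create_mirror_view simply returns the view itself.
void fill_and_upload() {
  Kokkos::View<double*> data("data", 100);          // default (device) memory space
  auto h_data = Kokkos::create_mirror_view(data);   // host mirror, or the view itself
  for (int i = 0; i < 100; ++i) h_data(i) = i;      // initialize on the host
  Kokkos::deep_copy(data, h_data);                  // copy host -> device
}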

View File

@ -92,5 +92,3 @@ static_assert(std::is_same<difference_type<Woof>, int>::value,
static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
"Bark's difference_type should be ptrdiff_t!");
} // namespace Example
int main() {}

View File

@ -45,20 +45,23 @@
#include <Kokkos_Core.hpp>
#include <cstdio>
namespace Test {
namespace {
template <class Device>
template <class Device, class T, T ImbalanceSz>
struct TestScan {
using execution_space = Device;
using value_type = int64_t;
using value_type = T;
Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
KOKKOS_INLINE_FUNCTION
void operator()(const int iwork, value_type& update,
const bool final_pass) const {
const value_type n = iwork + 1;
const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0;
const value_type n = iwork + 1;
const value_type imbalance =
((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz))
? ImbalanceSz
: value_type(0);
// Insert an artificial load imbalance
@ -133,12 +136,29 @@ struct TestScan {
}
}
};
} // namespace
TEST(TEST_CATEGORY, scan) {
TestScan<TEST_EXECSPACE>::test_range(1, 1000);
TestScan<TEST_EXECSPACE>(0);
TestScan<TEST_EXECSPACE>(100000);
TestScan<TEST_EXECSPACE>(10000000);
TEST_EXECSPACE().fence();
constexpr auto imbalance_size = 1000;
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>::test_range(1, 1000);
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(0);
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(100000);
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(10000000);
}
TEST(TEST_CATEGORY, small_size_scan) {
constexpr auto imbalance_size = 10; // Pick to not overflow...
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(0);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(5);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(10);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(
static_cast<std::size_t>(
std::sqrt(std::numeric_limits<std::int8_t>::max())));
constexpr auto short_imbalance_size = 100; // Pick to not overflow...
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(0);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(5);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(100);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(
static_cast<std::size_t>(
std::sqrt(std::numeric_limits<std::int16_t>::max())));
}
} // namespace Test
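The sqrt(numeric_limits<T>::max()) bound used above keeps the scan representable: an inclusive prefix sum over 1..N peaks at N(N+1)/2, so choosing N near sqrt(max) keeps the running total within T. A quick standalone check of that bound (illustration only, not part of the patch):

#include <cmath>
#include <cstdint>
#include <limits>

int main() {
  // For int8_t: max = 127, N = floor(sqrt(127)) = 11, peak prefix sum = 11*12/2 = 66 <= 127.
  const int max8 = std::numeric_limits<std::int8_t>::max();
  const int n8   = static_cast<int>(std::sqrt(max8));
  const int peak = n8 * (n8 + 1) / 2;
  return (peak <= max8) ? 0 : 1;  // exit code 0 means the bound holds
}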

View File

@ -1616,6 +1616,73 @@ struct TestTeamPolicyHandleByValue {
} // namespace
namespace {
template <typename ExecutionSpace>
struct TestRepeatedTeamReduce {
static constexpr int ncol = 1500; // nothing special, just some work
KOKKOS_FUNCTION void operator()(
const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
const {
// non-divisible by power of two to make triggering problems easier
constexpr int nlev = 129;
constexpr auto pi = Kokkos::Experimental::pi_v<double>;
double b = 0.;
for (int ri = 0; ri < 10; ++ri) {
// The contributions here must be sufficiently complex, simply adding ones
// wasn't enough to trigger the bug.
const auto g1 = [&](const int k, double &acc) {
acc += Kokkos::cos(pi * double(k) / nlev);
};
const auto g2 = [&](const int k, double &acc) {
acc += Kokkos::sin(pi * double(k) / nlev);
};
double a1, a2;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
b += a1;
b += a2;
}
const auto h = [&]() {
const auto col = team.league_rank();
v(col) = b + col;
};
Kokkos::single(Kokkos::PerTeam(team), h);
}
KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
if (v(i) != v(0) + i) {
++bad;
KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i);
}
}
TestRepeatedTeamReduce() : v("v", ncol) { test(); }
void test() {
int team_size_recommended =
Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended(
*this, Kokkos::ParallelForTag());
// Choose a non-recommended (non-power of two for GPUs) team size
int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1;
// The failure was non-deterministic so run the test a bunch of times
for (int it = 0; it < 100; ++it) {
Kokkos::parallel_for(
Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this);
int bad = 0;
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol),
*this, bad);
ASSERT_EQ(bad, 0) << " Failing in iteration " << it;
}
}
Kokkos::View<double *, ExecutionSpace> v;
};
} // namespace
} // namespace Test
/*--------------------------------------------------------------------------*/

View File

@ -134,5 +134,15 @@ TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) {
}
}
TEST(TEST_CATEGORY, repeated_team_reduce) {
#ifdef KOKKOS_ENABLE_OPENMPTARGET
if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not "
"properly implemented";
#endif
TestRepeatedTeamReduce<TEST_EXECSPACE>();
}
} // namespace Test
#endif

View File

@ -92,8 +92,18 @@ TEST(TEST_CATEGORY, view_is_assignable) {
View<double*, left, d_exec>>::test(false, false, 10);
// Layout assignment
Impl::TestAssignability<View<int, left, d_exec>,
View<int, right, d_exec>>::test(true, true);
Impl::TestAssignability<View<int*, left, d_exec>,
View<int*, right, d_exec>>::test(true, true, 10);
Impl::TestAssignability<View<int[5], left, d_exec>,
View<int*, right, d_exec>>::test(false, false, 10);
Impl::TestAssignability<View<int[10], left, d_exec>,
View<int*, right, d_exec>>::test(false, true, 10);
Impl::TestAssignability<View<int*, left, d_exec>,
View<int[5], right, d_exec>>::test(true, true);
Impl::TestAssignability<View<int[5], left, d_exec>,
View<int[10], right, d_exec>>::test(false, false);
// This could be made possible (due to the degenerate nature of the views) but
// we do not allow this yet

199
lib/kokkos/kokkos_5538.diff Normal file
View File

@ -0,0 +1,199 @@
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 22af411f32..530510a0d1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+ endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 88ddc48378..b83ced9243 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -102,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index f56cef1651..2585a6a64c 100644
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index ef16aad047..c1d76cceeb 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()
+IF (KOKKOS_ARCH_HOPPER90)
+ SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 56f9117844..fcd4773dbc 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
case 61: return 96;
case 70:
case 80:
- case 86: return 8;
+ case 86:
+ case 90: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index 40a263561f..8c40ebd60d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index f9451ecfe6..2ce1efb98c 100644
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_AMPERE)
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index a9bc085912..27ee1d4232 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 840db4327c..7e5addbc5b 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
- !defined(KOKKOS_ARCH_AMPERE)
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 5ac7d8af30..ba101f699e 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

523
lib/kokkos/kokkos_5706.diff Normal file
View File

@ -0,0 +1,523 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index fcd4773dbc..30b6958a67 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these number can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
return 0;
}() * 1024;
}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 5811498e01..e22eb3b842 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84f..5c4c3a7d39 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}
-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
+
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
- bool prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+ const KernelFuncPtr& func) {
+ // Only call cudaFuncGetAttributes once for each unique kernel
+ // by leveraging static variable initialization rules
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+ cudaFuncAttributes attr;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+ return attr;
+ };
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
+ return func_attr;
+}
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+ const cudaDeviceProp& device_props,
+ const size_t block_size, int& shmem,
+ const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
- // On Kepler the L1 has no benefit since it doesn't cache reads
+
+ const auto& func_attr =
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+ // Compute limits for number of blocks due to registers/SM
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
+ const size_t regs_per_thread = func_attr.numRegs;
+ // The granularity of register allocation is chunks of 256 registers per warp
+ // -> 8 registers per thread
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+ const size_t max_blocks_regs =
+ regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute how many threads per sm we actually want
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+ // only allocate multiples of warp size
+ const size_t num_threads_desired =
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+ // Get close to the desired occupancy,
+ // don't undershoot by much but also don't allocate a whole new block just
+ // because one is a few threads over otherwise.
+ size_t num_blocks_desired =
+ (num_threads_desired + block_size * 0.8) / block_size;
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+ // Calculate how much shared memory we need per block
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
+ // If we want to lower occupancy we have to make sure we request at least that
+ // much in aggregate over all blocks, so that shared memory actually becomes a
+ // limiting factor for occupancy
+ constexpr size_t min_shmem_size_per_sm = 8192;
+ if ((occupancy < 100) &&
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+ // Need to set the caller's shmem variable so that the
+ // kernel launch uses the correct dynamic shared memory request
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
+ }
+
+ // Compute the carveout fraction we need based on occupancy
+ // Use multiples of 8kB
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+ size_t carveout = shmem_per_block == 0
+ ? 0
+ : 100 *
+ (((num_blocks_desired * shmem_per_block +
+ min_shmem_size_per_sm - 1) /
+ min_shmem_size_per_sm) *
+ min_shmem_size_per_sm) /
+ max_shmem_per_sm;
+ if (carveout > 100) carveout = 100;
+
+ // Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
- func,
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
- return prefer_shmem;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+ return carveout;
};
- static bool cache_config_preference_cached = set_cache_config();
- if (cache_config_preference_cached != prefer_shmem) {
+ // Store the value in a static variable so we only reset if needed
+ static size_t cache_config_preference_cached = set_cache_config();
+ if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
- (void)prefer_shmem;
+ (void)device_props;
+ (void)block_size;
+ (void)occupancy;
#endif
}
-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const& policy, cudaDeviceProp const& properties,
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
- bool& prefer_shmem) {
- int const block_size = block.x * block.y * block.z;
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
- size_t const static_shmem = attributes.sharedSizeBytes;
-
- // round to nearest integer and avoid division by zero
- int active_blocks = std::max(
- 1, static_cast<int>(std::round(
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
- block_size * desired_occupancy / 100)));
- int const dynamic_shmem =
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
- if (dynamic_shmem > shmem) {
- shmem = dynamic_shmem;
- prefer_shmem = false;
- }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
void const* args[] = {&driver};
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if constexpr (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
+ driver, grid, block, shmem, cuda_instance);
}
#endif
};
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
- const CudaInternal* cuda_instance,
- bool prefer_shmem) {
+ const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem);
- // If a desired occupancy is specified, we compute how much shared memory
- // to ask for to achieve that occupancy, assuming that the cache
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
- // shared memory computed is actually smaller than `shmem` we overwrite
- // `shmem` and set `prefer_shmem` to `false`.
- modify_launch_configuration_if_desired_occupancy_is_specified(
- driver.get_policy(), cuda_instance->m_deviceProp,
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
- Impl::configure_shmem_preference<
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<
+ DriverType,
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
}
static cudaFuncAttributes get_cuda_func_attributes() {
- // Race condition inside of cudaFuncGetAttributes if the same address is
- // given requires using a local variable as input instead of a static Rely
- // on static variable initialization to make sure only one thread executes
- // the code and the result is visible.
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
- cudaFuncAttributes attr_tmp;
- KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
- return attr_tmp;
- };
- static cudaFuncAttributes attr = wrap_get_attributes();
- return attr;
+ return get_cuda_kernel_func_attributes<
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func());
}
};
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index e586bb4cc6..0e348c092a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index ac160f8fe2..d1031751c2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
}
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdd16085b3..ea9430b812 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
m_policy.space().fence(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 34d4bef9fd..178012431c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fb3a6b138f..a12378a891 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>(
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
- false);
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
}
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
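To make the carveout arithmetic in configure_shmem_preference above concrete, the following standalone sketch runs the same sequence of computations for one assumed configuration: a V100-like SM (64 K registers, 2048 threads, 96 KiB shared memory), a hypothetical kernel with 42 registers per thread and no static shared memory, block_size = 128, desired occupancy = 50%, and no dynamic shared memory requested. Illustration only, not part of the patch:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  using std::size_t;
  // Assumed device properties and kernel attributes (not from the patch).
  const size_t regs_per_sm = 64 * 1024, max_threads_per_sm = 2048;
  const size_t max_shmem_per_sm = 96 * 1024, min_shmem_size_per_sm = 8192;
  const size_t regs_per_thread = 42, block_size = 128, occupancy = 50;

  // Register limit: registers are allocated in chunks of 256 per warp, i.e. 8 per thread.
  const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);           // 48
  const size_t max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);  // 10

  // Desired resident threads, rounded to whole warps, then converted to blocks.
  const size_t num_threads_desired =
      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;                        // 1024
  size_t num_blocks_desired = (num_threads_desired + block_size * 0.8) / block_size;  // 8
  num_blocks_desired = std::min(max_blocks_regs, num_blocks_desired);

  // Inflate the per-block request so shared memory becomes the limiting factor.
  size_t shmem_per_block = 0;  // requested dynamic + static shared memory (0 in this example)
  if (occupancy < 100 && shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)
    shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;                     // 1024 B

  // Carveout as a percentage of shared memory per SM, in 8 KiB steps.
  size_t carveout = 100 *
      (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) /
        min_shmem_size_per_sm) * min_shmem_size_per_sm) / max_shmem_per_sm;           // 8
  carveout = std::min<size_t>(carveout, 100);

  // 8 blocks x 128 threads = 1024 resident threads, i.e. the requested 50% occupancy.
  std::printf("blocks/SM = %zu, dynamic shmem/block = %zu B, carveout = %zu%%\n",
              num_blocks_desired, shmem_per_block, carveout);
}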

View File

@ -0,0 +1,46 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 30b6958a67..b94f053272 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-// "Compute Capability" first and check what is the smallest "Shared Memory
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-// bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-// the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
- int const compute_capability = properties.major * 10 + properties.minor;
- return [compute_capability]() {
- switch (compute_capability) {
- case 30:
- case 32:
- case 35: return 16;
- case 37: return 80;
- case 50:
- case 53:
- case 60:
- case 62: return 64;
- case 52:
- case 61: return 96;
- case 70:
- case 80:
- case 86:
- case 90: return 8;
- case 75: return 32;
- default:
- Kokkos::Impl::throw_runtime_exception(
- "Unknown device in cuda block size deduction");
- }
- return 0;
- }() * 1024;
-}
-
} // namespace Impl
} // namespace Kokkos

204
lib/kokkos/kokkos_5739.diff Normal file
View File

@ -0,0 +1,204 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index b94f053272..252c13c524 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -53,17 +53,69 @@
namespace Kokkos {
namespace Impl {
+inline int cuda_warp_per_sm_allocation_granularity(
+ cudaDeviceProp const& properties) {
+ // Allocation granularity of warps in each sm
+ switch (properties.major) {
+ case 3:
+ case 5:
+ case 7:
+ case 8:
+ case 9: return 4;
+ case 6: return (properties.minor == 0 ? 2 : 4);
+ default:
+ throw_runtime_exception(
+ "Unknown device in cuda warp per sm allocation granularity");
+ return 0;
+ }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+ // Maximum number of warps per sm as a function of register counts,
+ // subject to the constraint that warps are allocated with a fixed granularity
+ int const max_regs_per_block = properties.regsPerBlock;
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
+ int const warp_granularity =
+ cuda_warp_per_sm_allocation_granularity(properties);
+ // The granularity of register allocation is chunks of 256 registers per warp,
+ // which implies a need to over-allocate, so we round up
+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
+
+ // The maximum number of warps per SM is constrained from above by register
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
+ // finite granularity, we need to round down.
+ int const max_warps_per_sm =
+ warp_granularity *
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+ return max_warps_per_sm;
+}
+
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
- regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ int const max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(properties, attributes);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / properties.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+ LaunchBounds) {
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
+ // memory
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+ LaunchBounds{});
+}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5c4c3a7d39..170183ca0a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ const size_t max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
+
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0e348c092a..7e4f62f12e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {
+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+ cudaFuncAttributes attr =
+ CudaParallelLaunch<ParallelType,
+ LaunchBounds>::get_cuda_func_attributes();
+ auto const& prop = pol.space().cuda_device_prop();
+
+ // Limits due to registers/SM, MDRange doesn't have
+ // shared memory constraints
+ int const optimal_block_size =
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+ // Compute how many blocks of this size we can launch, based on warp
+ // constraints
+ int const max_warps_per_sm_registers =
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+ int const max_num_threads_from_warps =
+ max_warps_per_sm_registers * prop.warpSize;
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+ // Compute the total number of threads
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+ return std::min(
+ max_threads_per_sm,
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelFor,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelReduce,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {
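A small worked example of the register constraint described in the comments above, with assumed numbers (not taken from the patch): a kernel using 96 registers per thread on an SM with 64 K registers needs 96 x 32 = 3072 registers per warp, so 21 warps would fit on register counts alone, but rounding down to a warp allocation granularity of 4 leaves 20 warps per SM, which the while-loop above then enforces on the block count.

#include <cstdio>

int main() {
  // Assumed values, not from the patch: a Volta/Ampere-like SM.
  const int regs_per_sm      = 64 * 1024;
  const int warp_size        = 32;
  const int regs_per_thread  = 96;
  const int warp_granularity = 4;  // warps are handed out in groups of 4 here

  // Registers are allocated per warp in chunks of 256, so round up.
  const int regs_per_warp           = regs_per_thread * warp_size;          // 3072
  const int allocated_regs_per_warp = 256 * ((regs_per_warp + 255) / 256);  // 3072, already a multiple

  // Round the register-limited warp count down to the allocation granularity.
  const int max_warps_per_sm =
      warp_granularity * (regs_per_sm / (allocated_regs_per_warp * warp_granularity));  // 20, not 21

  std::printf("register-limited warps per SM: %d\n", max_warps_per_sm);
}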

View File

@ -0,0 +1,63 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 170183ca0a..ba43e362bb 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
+ /*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
void const* args[] = {&driver};
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- if constexpr (DriverType::Policy::
+ if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

View File

@ -29,3 +29,4 @@ tag: 3.5.00 date: 11:19:2021 master: c28a8b03 release: 21b879e4
tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff
tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404
tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb
tag: 3.7.01 date: 12:01:2022 master: 61d7db55 release: d3bb8cfe

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -0,0 +1,3 @@
#Official Tool: clang-format version 8.0.0
DisableFormat: true
SortIncludes: false

View File

@ -76,7 +76,7 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__
#ifdef __CUDACC_RDC__
__constant__ extern
@ -138,42 +138,33 @@ namespace {
static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace
#ifdef __CUDACC_RDC__
inline
#else
static
#endif
void
copy_cuda_lock_arrays_to_device() {
if (lock_array_copied == 0) {
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
&CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
sizeof(int32_t*));
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
&CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
sizeof(int32_t*));
}
lock_array_copied = 1;
}
} // namespace Impl
} // namespace desul
/* It is critical that this code be a macro, so that it will
capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE;
putting this in an inline function will NOT do the right thing! */
#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
if (::desul::Impl::lock_array_copied == 0) { \
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE, \
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
sizeof(int32_t*)); \
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE, \
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \
sizeof(int32_t*)); \
} \
::desul::Impl::lock_array_copied = 1; \
}
#endif /* defined( __CUDACC__ ) */
#endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
namespace desul {
#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
inline void ensure_cuda_lock_arrays_on_device() {}
#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
static inline void ensure_cuda_lock_arrays_on_device() {
Impl::copy_cuda_lock_arrays_to_device();
}
#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
#endif
} // namespace desul
#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */

View File

@ -70,7 +70,7 @@ void init_lock_arrays_cuda() {
"init_lock_arrays_cuda: cudaMalloc host locks");
auto error_sync1 = cudaDeviceSynchronize();
copy_cuda_lock_arrays_to_device();
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
auto error_sync2 = cudaDeviceSynchronize();
@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() {
CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
#ifdef __CUDACC_RDC__
copy_cuda_lock_arrays_to_device();
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif
}

View File

@ -190,6 +190,20 @@ namespace LAMMPS_NS {
Tp_BIAS,Tp_RMASS,Tp_ZERO>(i);
}
KOKKOS_INLINE_FUNCTION
static void init(value_type &update) {
update.fx = 0.0;
update.fy = 0.0;
update.fz = 0.0;
}
KOKKOS_INLINE_FUNCTION
static void join(value_type &update,
const value_type &source) {
update.fx += source.fx;
update.fy += source.fy;
update.fz += source.fz;
}
KOKKOS_INLINE_FUNCTION
static void init(volatile value_type &update) {
update.fx = 0.0;
@ -233,6 +247,15 @@ namespace LAMMPS_NS {
energy += c.compute_energy_item(i);
}
KOKKOS_INLINE_FUNCTION
static void init(value_type &update) {
update = 0.0;
}
KOKKOS_INLINE_FUNCTION
static void join(value_type &update,
const value_type &source) {
update += source;
}
KOKKOS_INLINE_FUNCTION
static void init(volatile value_type &update) {
update = 0.0;
}
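The non-volatile init/join overloads added here follow the pattern newer Kokkos versions expect from custom reduction functors, now that reductions no longer go through volatile references (the volatile overloads are kept for backward compatibility). A minimal standalone sketch of that pattern, with hypothetical names rather than the actual LAMMPS functors:

#include <Kokkos_Core.hpp>

struct ForceSum {
  struct value_type { double fx, fy, fz; };

  Kokkos::View<const double*[3]> f;  // hypothetical per-atom forces

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, value_type& update) const {
    update.fx += f(i, 0);
    update.fy += f(i, 1);
    update.fz += f(i, 2);
  }

  // Non-volatile overloads: these are the ones current Kokkos reductions call.
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& update) { update.fx = update.fy = update.fz = 0.0; }

  KOKKOS_INLINE_FUNCTION
  static void join(value_type& update, const value_type& source) {
    update.fx += source.fx;
    update.fy += source.fy;
    update.fz += source.fz;
  }
};

ForceSum::value_type total_force(Kokkos::View<const double*[3]> forces) {
  ForceSum::value_type total;
  Kokkos::parallel_reduce("total_force", forces.extent(0), ForceSum{forces}, total);
  return total;
}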

View File

@ -77,7 +77,6 @@ GPU_AWARE_UNKNOWN
using namespace LAMMPS_NS;
Kokkos::InitArguments KokkosLMP::args{-1, -1, -1, false};
int KokkosLMP::is_finalized = 0;
int KokkosLMP::init_ngpus = 0;
@ -110,7 +109,6 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
ngpus = 0;
int device = 0;
nthreads = 1;
numa = 1;
int iarg = 0;
while (iarg < narg) {
@ -189,30 +187,24 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
iarg += 2;
} else if (strcmp(arg[iarg],"n") == 0 ||
strcmp(arg[iarg],"numa") == 0) {
numa = utils::inumeric(FLERR, arg[iarg+1], false, lmp);
iarg += 2;
} else error->all(FLERR,"Invalid Kokkos command-line arg: {}", arg[iarg]);
}
// Initialize Kokkos. However, we cannot change any
// Kokkos library parameters after the first initalization
if (args.num_threads != -1) {
if ((args.num_threads != nthreads) || (args.num_numa != numa) || (args.device_id != device))
Kokkos::InitializationSettings args;
if (args.has_num_threads()) {
if ((args.get_num_threads() != nthreads) || (args.get_device_id() != device))
if (me == 0)
error->warning(FLERR,"Kokkos package already initalized, "
"cannot reinitialize with different parameters");
nthreads = args.num_threads;
numa = args.num_numa;
device = args.device_id;
error->warning(FLERR,"Kokkos package already initalized. Cannot change parameters");
nthreads = args.get_num_threads();
device = args.get_device_id();
ngpus = init_ngpus;
} else {
args.num_threads = nthreads;
args.num_numa = numa;
args.device_id = device;
args.set_num_threads(nthreads);
args.set_device_id(device);
init_ngpus = ngpus;
}
@ -350,7 +342,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
/* ---------------------------------------------------------------------- */
void KokkosLMP::initialize(Kokkos::InitArguments args, Error *error)
void KokkosLMP::initialize(Kokkos::InitializationSettings args, Error *error)
{
if (!Kokkos::is_initialized()) {
if (is_finalized)
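For reference, the Kokkos::InitializationSettings API that replaces Kokkos::InitArguments in this file is used along these lines (minimal sketch, values hypothetical):

#include <Kokkos_Core.hpp>

int main() {
  Kokkos::InitializationSettings settings;
  settings.set_num_threads(4);   // host threads per process
  settings.set_device_id(0);     // GPU to bind this process to
  Kokkos::initialize(settings);  // replaces Kokkos::initialize(InitArguments)
  {
    // ... parallel kernels ...
  }
  Kokkos::finalize();
}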

View File

@ -43,7 +43,6 @@ class KokkosLMP : protected Pointers {
int forward_fix_comm_changed;
int reverse_comm_changed;
int nthreads,ngpus;
int numa;
int auto_sync;
int gpu_aware_flag;
int neigh_thread;
@ -53,12 +52,11 @@ class KokkosLMP : protected Pointers {
double binsize;
static int is_finalized;
static Kokkos::InitArguments args;
static int init_ngpus;
KokkosLMP(class LAMMPS *, int, char **);
static void initialize(Kokkos::InitArguments, Error *);
static void initialize(Kokkos::InitializationSettings, Error *);
static void finalize();
void accelerator(int, char **);
int neigh_count(int);

View File

@ -524,6 +524,12 @@ struct PairReaxKokkosFindBondFunctor {
PairReaxFFKokkos<DeviceType> c;
PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr):c(*c_ptr) {};
KOKKOS_INLINE_FUNCTION
void join(int &dst,
const int &src) const {
dst = MAX(dst,src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile int &dst,
const volatile int &src) const {

View File

@ -52,7 +52,6 @@ class KokkosLMP {
int kokkos_exists;
int nthreads;
int ngpus;
int numa;
KokkosLMP(class LAMMPS *, int, char **) { kokkos_exists = 0; }
~KokkosLMP() {}

View File

@ -91,7 +91,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
nthreads = 1;
#ifdef _OPENMP
if (lmp->kokkos) {
nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
nthreads = lmp->kokkos->nthreads;
} else if (getenv("OMP_NUM_THREADS") == nullptr) {
nthreads = 1;
if (me == 0)