diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 83691f1841..f2cfa078c2 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -49,8 +49,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) - set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.00.tar.gz" CACHE STRING "URL for KOKKOS tarball") - set(KOKKOS_MD5 "84991eca9f066383abe119a5bc7a11c4" CACHE STRING "MD5 checksum of KOKKOS tarball") + set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.01.tar.gz" CACHE STRING "URL for KOKKOS tarball") + set(KOKKOS_MD5 "f140e02b826223b1045207d9bc10d404" CACHE STRING "MD5 checksum of KOKKOS tarball") mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_MD5) ExternalProject_Add(kokkos_build @@ -74,7 +74,7 @@ if(DOWNLOAD_KOKKOS) add_dependencies(LAMMPS::KOKKOSCORE kokkos_build) add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 3.7.00 REQUIRED CONFIG) + find_package(Kokkos 3.7.01 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) target_link_libraries(lmp PRIVATE Kokkos::kokkos) else() diff --git a/doc/src/Run_options.rst b/doc/src/Run_options.rst index 75bb2cf288..4dbd6f7d2f 100644 --- a/doc/src/Run_options.rst +++ b/doc/src/Run_options.rst @@ -105,13 +105,12 @@ Either the full word or an abbreviation can be used for the keywords. Note that the keywords do not use a leading minus sign. I.e. the keyword is "t", not "-t". Also note that each of the keywords has a default setting. Examples of when to use these options and what -settings to use on different platforms is given on the :doc:`KOKKOS package ` -doc page. +settings to use on different platforms is given on the :doc:`KOKKOS +package ` doc page. * d or device * g or gpus * t or threads -* n or numa .. parsed-literal:: @@ -164,19 +163,10 @@ the number of physical cores per node, to use your available hardware optimally. This also sets the number of threads used by the host when LAMMPS is compiled with CUDA=yes. -.. parsed-literal:: +.. deprecated:: 22Dec2022 - numa Nm - -This option is only relevant when using pthreads with hwloc support. -In this case Nm defines the number of NUMA regions (typically sockets) -on a node which will be utilized by a single MPI rank. By default Nm -= 1. If this option is used the total number of worker-threads per -MPI rank is threads\*numa. Currently it is always almost better to -assign at least one MPI rank per NUMA region, and leave numa set to -its default value of 1. This is because letting a single process span -multiple NUMA regions induces a significant amount of cross NUMA data -traffic which is slow. +Support for the "numa" or "n" option has been removed, since Kokkos +has ignored this setting for some time.
---------- diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index e81f294451..bdbc75604b 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,27 @@ # Change Log +## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01) + +### Bug Fixes: +- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547) +- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552) +- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521) +- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553) +- Fix warning calling a `__host__` function from a `__host__ __device__` from `View:: as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591) +- OpenMPTarget: adding implementation to set device id. [\#5557](https://github.com/kokkos/kokkos/pull/5557) +- Use `Kokkos::atomic_load` to Correct Race Condition Giving Rise to Seg Faulting Error in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559) +- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561) +- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565) +- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566) +- Do not add -cuda to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569) +- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571) +- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582) +- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573) +- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592) +- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607) +- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622) + ## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00) @@ -102,7 +124,6 @@ - Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120) - Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117) - Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111) -- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059) - Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957) - Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810) - Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382) diff --git 
a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index a05bfcdb94..7b78f29d73 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -129,7 +129,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) set(Kokkos_VERSION_MINOR 7) -set(Kokkos_VERSION_PATCH 00) +set(Kokkos_VERSION_PATCH 01) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -152,6 +152,7 @@ ENDIF() # but scoping issues can make it difficult GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) GLOBAL_SET(KOKKOS_LINK_OPTIONS) +GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) GLOBAL_SET(KOKKOS_CUDA_OPTIONS) GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) @@ -228,6 +229,9 @@ IF (KOKKOS_HAS_TRILINOS) # we have to match the annoying behavior, also we have to preserve quotes # which needs another workaround. SET(KOKKOS_COMPILE_OPTIONS_TMP) + IF (KOKKOS_ENABLE_HIP) + LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS}) + ENDIF() FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) IF(OPTION_HAS_WHITESPACE EQUAL -1) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 7aa04aad7d..530510a0d1 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -12,7 +12,7 @@ endif KOKKOS_VERSION_MAJOR = 3 KOKKOS_VERSION_MINOR = 7 -KOKKOS_VERSION_PATCH = 00 +KOKKOS_VERSION_PATCH = 01 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP" #KOKKOS_DEVICES ?= "Threads" # Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR -# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 +# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A @@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) +KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90) KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ @@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ - + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) + + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \ + + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90)) #SEK: This seems like a bug to me ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) @@ -505,10 +507,6 @@ KOKKOS_LINK_FLAGS = KOKKOS_SRC = KOKKOS_HEADERS = -#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) - KOKKOS_LIBS += -latomic -#endif - # Generating the KokkosCore_config.h file. 
KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp @@ -550,6 +548,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + KOKKOS_LIBS += -latomic tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET') ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC") @@ -1197,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 + endif ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) diff --git a/lib/kokkos/algorithms/cmake/Dependencies.cmake b/lib/kokkos/algorithms/cmake/Dependencies.cmake index 1b41310681..c36b62523f 100644 --- a/lib/kokkos/algorithms/cmake/Dependencies.cmake +++ b/lib/kokkos/algorithms/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC TEST_OPTIONAL_TPLS CUSPARSE ) diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp index ad0c2d47b6..c7be70e09a 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -265,8 +265,8 @@ class BinSort { //---------------------------------------- // Create the permutation vector, the bin_offset array and the bin_count // array. Can be called again if keys changed - template - void create_permute_vector(const ExecutionSpace& exec = exec_space{}) { + template + void create_permute_vector(const ExecutionSpace& exec) { static_assert( Kokkos::SpaceAccessibility::accessible, @@ -297,6 +297,15 @@ class BinSort { *this); } + // Create the permutation vector, the bin_offset array and the bin_count + // array. 
Can be called again if keys changed + void create_permute_vector() { + Kokkos::fence("Kokkos::Binsort::create_permute_vector: before"); + exec_space e{}; + create_permute_vector(e); + e.fence("Kokkos::Binsort::create_permute_vector: after"); + } + // Sort a subset of a view with respect to the first dimension using the // permutation array template @@ -372,9 +381,10 @@ class BinSort { template void sort(ValuesViewType const& values, int values_range_begin, int values_range_end) const { + Kokkos::fence("Kokkos::Binsort::sort: before"); exec_space exec; sort(exec, values, values_range_begin, values_range_end); - exec.fence("Kokkos::Sort: fence after sorting"); + exec.fence("Kokkos::BinSort:sort: after"); } template @@ -641,9 +651,10 @@ std::enable_if_t::value> sort( template void sort(ViewType const& view) { + Kokkos::fence("Kokkos::sort: before"); typename ViewType::execution_space exec; sort(exec, view); - exec.fence("Kokkos::Sort: fence after sorting"); + exec.fence("Kokkos::sort: fence after sorting"); } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 @@ -682,6 +693,7 @@ std::enable_if_t::value> sort( template void sort(ViewType view, size_t const begin, size_t const end) { + Kokkos::fence("Kokkos::sort: before"); typename ViewType::execution_space exec; sort(exec, view, begin, end); exec.fence("Kokkos::Sort: fence after sorting"); diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index e1a2088138..566b355c5b 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -10,10 +10,12 @@ # Default settings: change those according to your machine. For # example, you may have have two different wrappers with either icpc # or g++ as their back-end compiler. The defaults can be overwritten -# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc). +# by using the usual arguments (e.g., -arch=sm_80 -ccbin icpc). +# sm_70 is supported by every CUDA version from 9-12 and is thus +# chosen as default -default_arch="sm_35" -#default_arch="sm_50" +default_arch="sm_70" +#default_arch="sm_80" # # The default C++ compiler. diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 34807ac2b2..b83ced9243 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -66,6 +66,7 @@ #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV81 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2 +#cmakedefine KOKKOS_ARCH_A64FX #cmakedefine KOKKOS_ARCH_AMD_AVX2 #cmakedefine KOKKOS_ARCH_AVX #cmakedefine KOKKOS_ARCH_AVX2 @@ -101,6 +102,7 @@ #cmakedefine KOKKOS_ARCH_AMPERE #cmakedefine KOKKOS_ARCH_AMPERE80 #cmakedefine KOKKOS_ARCH_AMPERE86 +#cmakedefine KOKKOS_ARCH_HOPPER90 #cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN2 #cmakedefine KOKKOS_ARCH_AMD_ZEN3 diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc index f56cef1651..2585a6a64c 100644 --- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc +++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc @@ -74,6 +74,7 @@ int main() { case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; + case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." 
<< std::endl; break; default: std::cout << "Compute capability " << compute_capability << " is not supported" << std::endl; diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index d4c2cda651..c1d76cceeb 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") +KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS) SET(KOKKOS_SHOW_HIP_ARCHS ON) @@ -187,7 +188,9 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) SET(CUDA_ARCH_FLAG "-gpu") GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda) + IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda) + ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) SET(CUDA_ARCH_FLAG "-arch") ENDIF() @@ -542,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72) CHECK_CUDA_ARCH(TURING75 sm_75) CHECK_CUDA_ARCH(AMPERE80 sm_80) CHECK_CUDA_ARCH(AMPERE86 sm_86) +CHECK_CUDA_ARCH(HOPPER90 sm_90) SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) @@ -804,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) SET(KOKKOS_ARCH_AMPERE ON) ENDIF() +IF (KOKKOS_ARCH_HOPPER90) + SET(KOKKOS_ARCH_HOPPER ON) +ENDIF() + #Regardless of version, make sure we define the general architecture name IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A) SET(KOKKOS_ARCH_VEGA ON) diff --git a/lib/kokkos/containers/cmake/Dependencies.cmake b/lib/kokkos/containers/cmake/Dependencies.cmake index 5e29157369..1d71d8af34 100644 --- a/lib/kokkos/containers/cmake/Dependencies.cmake +++ b/lib/kokkos/containers/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC TEST_OPTIONAL_TPLS CUSPARSE ) diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 442f0d8617..059ce8a610 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -1701,7 +1701,11 @@ namespace Impl { underlying memory, to facilitate implementation of deep_copy() and other routines that are defined on View */ template -KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView v) { +KOKKOS_FUNCTION auto as_view_of_rank_n( + DynRankView v, + typename std::enable_if::specialize, void>::value>::type* = + nullptr) { if (v.rank() != N) { KOKKOS_IF_ON_HOST( const std::string message = @@ -2114,9 +2118,10 @@ inline auto create_mirror( namespace Impl { template inline std::enable_if_t< - std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && + !Impl::ViewCtorProp::has_memory_space && + std::is_same< + typename DynRankView::memory_space, + typename DynRankView::HostMirror::memory_space>::value && std::is_same< typename DynRankView::data_type, typename DynRankView::HostMirror::data_type>::value, 
@@ -2128,12 +2133,13 @@ create_mirror_view(const DynRankView& src, template inline std::enable_if_t< - !(std::is_same< - typename DynRankView::memory_space, - typename DynRankView::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView::data_type, - typename DynRankView::HostMirror::data_type>::value), + !Impl::ViewCtorProp::has_memory_space && + !(std::is_same< + typename DynRankView::memory_space, + typename DynRankView::HostMirror::memory_space>::value && + std::is_same< + typename DynRankView::data_type, + typename DynRankView::HostMirror::data_type>::value), typename DynRankView::HostMirror> create_mirror_view( const DynRankView& src, @@ -2141,29 +2147,39 @@ create_mirror_view( return Kokkos::Impl::create_mirror(src, arg_prop); } -template +template ::has_memory_space>> inline std::enable_if_t< - Kokkos::is_space::value && - Impl::MirrorDRViewType::is_same_memspace, - typename Impl::MirrorDRViewType::view_type> -create_mirror_view(const Space&, const Kokkos::DynRankView& src, + Kokkos::is_space< + typename Impl::ViewCtorProp::memory_space>::value && + Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::is_same_memspace, + typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type> +create_mirror_view(const Kokkos::DynRankView& src, const typename Impl::ViewCtorProp&) { return src; } -template +template ::has_memory_space>> inline std::enable_if_t< - Kokkos::is_space::value && - !Impl::MirrorDRViewType::is_same_memspace, - typename Impl::MirrorDRViewType::view_type> + Kokkos::is_space< + typename Impl::ViewCtorProp::memory_space>::value && + !Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::is_same_memspace, + typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp::memory_space, T, + P...>::view_type> create_mirror_view( - const Space&, const Kokkos::DynRankView& src, + const Kokkos::DynRankView& src, const typename Impl::ViewCtorProp& arg_prop) { - using MemorySpace = typename Space::memory_space; - using alloc_prop = Impl::ViewCtorProp; - alloc_prop prop_copy(arg_prop); - - return Kokkos::Impl::create_mirror(src, prop_copy); + return Kokkos::Impl::create_mirror(src, arg_prop); } } // namespace Impl @@ -2224,9 +2240,10 @@ create_mirror_view( template inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, - const Space& space, + const Space&, const Kokkos::DynRankView& src) { - return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi)); + return Impl::create_mirror_view( + src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } template diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index 015a75cb0b..a2b68064de 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -710,7 +710,7 @@ template inline auto create_mirror( const Space&, const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror( - src, Impl::ViewCtorProp<>{typename Space::memory_space{}}); + src, Kokkos::view_alloc(typename Space::memory_space{})); } template @@ -729,48 +729,68 @@ inline auto create_mirror( } namespace Impl { + template inline std::enable_if_t< - (std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename 
Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp::has_memory_space && + (std::is_same< + typename Kokkos::Experimental::DynamicView::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::DynamicView::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>::value), typename Kokkos::Experimental::DynamicView::HostMirror> -create_mirror_view( - const typename Kokkos::Experimental::DynamicView& src, - const Impl::ViewCtorProp&) { +create_mirror_view(const Kokkos::Experimental::DynamicView& src, + const Impl::ViewCtorProp&) { return src; } template inline std::enable_if_t< - !(std::is_same< - typename Kokkos::Experimental::DynamicView::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp::has_memory_space && + !(std::is_same< + typename Kokkos::Experimental::DynamicView::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::DynamicView::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>::value), typename Kokkos::Experimental::DynamicView::HostMirror> create_mirror_view(const Kokkos::Experimental::DynamicView& src, const Impl::ViewCtorProp& arg_prop) { return Kokkos::create_mirror(arg_prop, src); } -template -inline std::enable_if_t< - Impl::MirrorDynamicViewType::is_same_memspace, - typename Kokkos::Impl::MirrorDynamicViewType::view_type> -create_mirror_view(const Space&, - const Kokkos::Experimental::DynamicView& src, +template ::has_memory_space>> +std::enable_if_t::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::Experimental::DynamicView& src, const Impl::ViewCtorProp&) { return src; } + +template ::has_memory_space>> +std::enable_if_t::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::Experimental::DynamicView& src, + const Impl::ViewCtorProp& arg_prop) { + return Kokkos::Impl::create_mirror(src, arg_prop); +} } // namespace Impl // Create a mirror view in host space @@ -790,8 +810,9 @@ inline auto create_mirror_view( // Create a mirror in a new space template inline auto create_mirror_view( - const Space& space, const Kokkos::Experimental::DynamicView& src) { - return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{}); + const Space&, const Kokkos::Experimental::DynamicView& src) { + return Impl::create_mirror_view(src, + view_alloc(typename Space::memory_space{})); } template diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 0b54d1bdd9..5027763a02 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -1901,19 +1901,22 @@ struct MirrorOffsetType { namespace Impl { template -inline typename Kokkos::Experimental::OffsetView::HostMirror +inline std::enable_if_t< + !Impl::ViewCtorProp::has_memory_space, + typename 
Kokkos::Experimental::OffsetView::HostMirror> create_mirror(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { return typename Kokkos::Experimental::OffsetView::HostMirror( Kokkos::create_mirror(arg_prop, src.view()), src.begins()); } -template -inline typename Kokkos::Impl::MirrorOffsetType::view_type -create_mirror(const Space&, - const Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp& arg_prop) { +template ::has_memory_space>> +inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, + const Impl::ViewCtorProp& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp; + using Space = typename Impl::ViewCtorProp::memory_space; static_assert( !alloc_prop_input::has_label, @@ -1923,10 +1926,6 @@ create_mirror(const Space&, !alloc_prop_input::has_pointer, "The view constructor arguments passed to Kokkos::create_mirror must " "not include a pointer!"); - static_assert( - !alloc_prop_input::has_memory_space, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a memory space instance!"); static_assert( !alloc_prop_input::allow_padding, "The view constructor arguments passed to Kokkos::create_mirror must " @@ -1962,15 +1961,17 @@ inline auto create_mirror( template ::value>> inline auto create_mirror( - const Space& space, const Kokkos::Experimental::OffsetView& src) { - return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{}); + const Space&, const Kokkos::Experimental::OffsetView& src) { + return Impl::create_mirror( + src, Kokkos::view_alloc(typename Space::memory_space{})); } template typename Kokkos::Impl::MirrorOffsetType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space, +create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::Experimental::OffsetView& src) { - return Impl::create_mirror(space, src, Kokkos::view_alloc(wi)); + return Impl::create_mirror( + src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } template @@ -1983,54 +1984,64 @@ inline auto create_mirror( namespace Impl { template inline std::enable_if_t< - (std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp::has_memory_space && + (std::is_same< + typename Kokkos::Experimental::OffsetView::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::OffsetView::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>::value), typename Kokkos::Experimental::OffsetView::HostMirror> -create_mirror_view( - const typename Kokkos::Experimental::OffsetView& src, - const Impl::ViewCtorProp&) { +create_mirror_view(const Kokkos::Experimental::OffsetView& src, + const Impl::ViewCtorProp&) { return src; } template inline std::enable_if_t< - !(std::is_same< - typename Kokkos::Experimental::OffsetView::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp::has_memory_space && + !(std::is_same< + typename Kokkos::Experimental::OffsetView::memory_space, + typename 
Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::OffsetView::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>::value), typename Kokkos::Experimental::OffsetView::HostMirror> create_mirror_view(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { return Kokkos::create_mirror(arg_prop, src); } -template -inline std::enable_if_t< - Impl::MirrorOffsetViewType::is_same_memspace, - Kokkos::Experimental::OffsetView> -create_mirror_view(const Space&, - const Kokkos::Experimental::OffsetView& src, +template ::has_memory_space>> +std::enable_if_t::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp&) { return src; } -template -std::enable_if_t< - !Impl::MirrorOffsetViewType::is_same_memspace, - typename Kokkos::Impl::MirrorOffsetViewType::view_type> -create_mirror_view(const Space& space, - const Kokkos::Experimental::OffsetView& src, +template ::has_memory_space>> +std::enable_if_t::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::Experimental::OffsetView& src, const Impl::ViewCtorProp& arg_prop) { - return create_mirror(space, src, arg_prop); + return Kokkos::Impl::create_mirror(src, arg_prop); } } // namespace Impl @@ -2052,15 +2063,17 @@ inline auto create_mirror_view( template ::value>> inline auto create_mirror_view( - const Space& space, const Kokkos::Experimental::OffsetView& src) { - return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{}); + const Space&, const Kokkos::Experimental::OffsetView& src) { + return Impl::create_mirror_view( + src, Kokkos::view_alloc(typename Space::memory_space{})); } template inline auto create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, const Space& space, + Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::Experimental::OffsetView& src) { - return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi)); + return Impl::create_mirror_view( + src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } template diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt index f16572b603..261d9dcd42 100644 --- a/lib/kokkos/containers/unit_tests/CMakeLists.txt +++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt @@ -46,3 +46,13 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() + +SET(COMPILE_ONLY_SOURCES + TestCreateMirror.cpp +) +KOKKOS_ADD_EXECUTABLE( + TestCompileOnly + SOURCES + TestCompileMain.cpp + ${COMPILE_ONLY_SOURCES} +) diff --git a/lib/kokkos/containers/unit_tests/TestCompileMain.cpp b/lib/kokkos/containers/unit_tests/TestCompileMain.cpp new file mode 100644 index 0000000000..237c8ce181 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestCompileMain.cpp @@ -0,0 +1 @@ +int main() {} diff --git a/lib/kokkos/containers/unit_tests/TestCreateMirror.cpp b/lib/kokkos/containers/unit_tests/TestCreateMirror.cpp new file mode 100644 index 0000000000..0e43be4364 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestCreateMirror.cpp @@ -0,0 +1,179 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include + +template +void check_memory_space(TestView, MemorySpace) { + static_assert( + std::is_same::value, ""); +} + +template +auto host_mirror_test_space(View) { + return std::conditional_t< + Kokkos::SpaceAccessibility::accessible, + typename View::memory_space, Kokkos::HostSpace>{}; +} + +template +void test_create_mirror_properties(const View& view) { + using namespace Kokkos; + using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space; + + // clang-format off + + // create_mirror +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view::value) { + check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view)); + check_memory_space(create_mirror( view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + + // create_mirror_view +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view::value) { + check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view( view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + + // create_mirror view_alloc +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view::value) { + check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view view_alloc +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view::value) { + check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror view_alloc + execution space +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view::value) { + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, 
DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view view_alloc + execution space +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view::value) { + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy + check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy view_alloc + check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy view_alloc + execution space + check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{}); + + // clang-format on +} + +void test_create_mirror_dynrankview() { + Kokkos::DynRankView device_view( + "device view", 10); + Kokkos::DynRankView host_view("host view", 10); + + test_create_mirror_properties(device_view); + test_create_mirror_properties(host_view); +} + +void test_reate_mirror_offsetview() { + Kokkos::Experimental::OffsetView + device_view("device view", {0, 10}); + Kokkos::Experimental::OffsetView host_view( + "host view", {0, 10}); + + test_create_mirror_properties(device_view); + test_create_mirror_properties(host_view); +} + +void test_create_mirror_dynamicview() { + Kokkos::Experimental::DynamicView + device_view("device view", 2, 10); + Kokkos::Experimental::DynamicView host_view( + "host view", 2, 10); + + test_create_mirror_properties(device_view); + test_create_mirror_properties(host_view); +} diff --git a/lib/kokkos/core/cmake/Dependencies.cmake b/lib/kokkos/core/cmake/Dependencies.cmake index cc901a4ede..611c089b2e 100644 --- a/lib/kokkos/core/cmake/Dependencies.cmake +++ b/lib/kokkos/core/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib TEST_OPTIONAL_TPLS CUSPARSE ) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 993c8d1bba..70f5b70fc0 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -53,13 +53,69 @@ namespace Kokkos { namespace Impl { +inline int cuda_warp_per_sm_allocation_granularity( + cudaDeviceProp const& properties) { + // Allocation granularity of warps in each sm + switch (properties.major) { + case 3: + case 5: + case 7: + case 8: + case 9: return 4; + case 6: return (properties.minor == 0 ? 
2 : 4); + default: + throw_runtime_exception( + "Unknown device in cuda warp per sm allocation granularity"); + return 0; + } +} + +inline int cuda_max_warps_per_sm_registers( + cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) { + // Maximum number of warps per sm as a function of register counts, + // subject to the constraint that warps are allocated with a fixed granularity + int const max_regs_per_block = properties.regsPerBlock; + int const regs_per_warp = attributes.numRegs * properties.warpSize; + int const warp_granularity = + cuda_warp_per_sm_allocation_granularity(properties); + // The granularity of register allocation is chunks of 256 registers per warp, + // which implies a need to over-allocate, so we round up + int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256); + + // The maximum number of warps per SM is constrained from above by register + // allocation. To satisfy the constraint that warps per SM is allocated at a + // finite granularity, we need to round down. + int const max_warps_per_sm = + warp_granularity * + (max_regs_per_block / (allocated_regs_per_warp * warp_granularity)); + + return max_warps_per_sm; +} + inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, cudaFuncAttributes const& attributes, int block_size, size_t dynamic_shmem) { - // Limits due do registers/SM + // Limits due to registers/SM int const regs_per_sm = properties.regsPerMultiprocessor; int const regs_per_thread = attributes.numRegs; - int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size); + // The granularity of register allocation is chunks of 256 registers per warp + // -> 8 registers per thread + int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); + int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size); + + // Compute the maximum number of warps as a function of the number of + // registers + int const max_warps_per_sm_registers = + cuda_max_warps_per_sm_registers(properties, attributes); + + // Constrain the number of blocks to respect the maximum number of warps per + // SM On face value this should be an equality, but due to the warp + // granularity constraints noted in `cuda_max_warps_per_sm_registers` the + // left-hand-side of this comparison can overshoot what the hardware allows + // based on register counts alone + while ((max_blocks_regs * block_size / properties.warpSize) > + max_warps_per_sm_registers) + max_blocks_regs--; // Limits due to shared memory/SM size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor; @@ -203,40 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, LaunchBounds{}); } -// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1) -// NOTE these number can be obtained several ways: -// * One option is to download the CUDA Occupancy Calculator spreadsheet, select -// "Compute Capability" first and check what is the smallest "Shared Memory -// Size Config" that is available. The "Shared Memory Per Multiprocessor" in -// bytes is then to be found below in the summary. 
-// * Another option would be to look for the information in the "Tuning -// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in -// the "Shared Memory" section (more tedious) -inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { - int const compute_capability = properties.major * 10 + properties.minor; - return [compute_capability]() { - switch (compute_capability) { - case 30: - case 32: - case 35: return 16; - case 37: return 80; - case 50: - case 53: - case 60: - case 62: return 64; - case 52: - case 61: return 96; - case 70: - case 80: - case 86: return 8; - case 75: return 32; - default: - Kokkos::Impl::throw_runtime_exception( - "Unknown device in cuda block size deduction"); - } - return 0; - }() * 1024; +template +int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr, + LaunchBounds) { + auto const& prop = Kokkos::Cuda().cuda_device_prop(); + + // Thin version of cuda_get_opt_block_size for cases where there is no shared + // memory + auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; }; + + return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem, + LaunchBounds{}); } + } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp index 40a263561f..8c40ebd60d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp @@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION #endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010 #if CUDA_VERSION >= 11010 && \ - ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86))) + ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER))) KOKKOS_INLINE_FUNCTION bhalf_t cast_to_bhalf(bhalf_t val) { return val; } KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 5811498e01..e22eb3b842 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default } #endif -#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API - cudaThreadSetCacheConfig(cudaFuncCachePreferShared); -#else - cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); -#endif - // Init the array for used for arbitrarily sized atomics if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays(); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 88810b6fc2..ba43e362bb 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -93,10 +93,6 @@ namespace Impl { // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) // function qualifier which could be used to improve performance. 
//---------------------------------------------------------------------------- -// Maximize L1 cache and minimize shared memory: -// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 ); -// For 2.0 capability: 48 KB L1 and 16 KB shared -//---------------------------------------------------------------------------- template __global__ static void cuda_parallel_launch_constant_memory() { @@ -158,63 +154,119 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { } } -// This function needs to be template on DriverType and LaunchBounds +// These functions needs to be template on DriverType and LaunchBounds // so that the static bool is unique for each type combo // KernelFuncPtr does not necessarily contain that type information. + template -inline void configure_shmem_preference(KernelFuncPtr const& func, - bool prefer_shmem) { -#ifndef KOKKOS_ARCH_KEPLER - // On Kepler the L1 has no benefit since it doesn't cache reads - auto set_cache_config = [&] { - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig( - func, - (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1))); - return prefer_shmem; +const cudaFuncAttributes& get_cuda_kernel_func_attributes( + const KernelFuncPtr& func) { + // Only call cudaFuncGetAttributes once for each unique kernel + // by leveraging static variable initialization rules + auto wrap_get_attributes = [&]() -> cudaFuncAttributes { + cudaFuncAttributes attr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); + return attr; }; - static bool cache_config_preference_cached = set_cache_config(); - if (cache_config_preference_cached != prefer_shmem) { + static cudaFuncAttributes func_attr = wrap_get_attributes(); + return func_attr; +} + +template +inline void configure_shmem_preference(const KernelFuncPtr& func, + const cudaDeviceProp& device_props, + const size_t block_size, int& shmem, + const size_t occupancy) { +#ifndef KOKKOS_ARCH_KEPLER + + const auto& func_attr = + get_cuda_kernel_func_attributes(func); + + // Compute limits for number of blocks due to registers/SM + const size_t regs_per_sm = device_props.regsPerMultiprocessor; + const size_t regs_per_thread = func_attr.numRegs; + // The granularity of register allocation is chunks of 256 registers per warp + // -> 8 registers per thread + const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); + size_t max_blocks_regs = + regs_per_sm / (allocated_regs_per_thread * block_size); + + // Compute the maximum number of warps as a function of the number of + // registers + const size_t max_warps_per_sm_registers = + cuda_max_warps_per_sm_registers(device_props, func_attr); + + // Constrain the number of blocks to respect the maximum number of warps per + // SM On face value this should be an equality, but due to the warp + // granularity constraints noted in `cuda_max_warps_per_sm_registers` the + // left-hand-side of this comparison can overshoot what the hardware allows + // based on register counts alone + while ((max_blocks_regs * block_size / device_props.warpSize) > + max_warps_per_sm_registers) + max_blocks_regs--; + + // Compute how many threads per sm we actually want + const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; + // only allocate multiples of warp size + const size_t num_threads_desired = + ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32; + // Get close to the desired occupancy, + // don't undershoot by much but also don't allocate a whole new block just + // because one is a few threads 
over otherwise. + size_t num_blocks_desired = + (num_threads_desired + block_size * 0.8) / block_size; + num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired); + if (num_blocks_desired == 0) num_blocks_desired = 1; + + // Calculate how much shared memory we need per block + size_t shmem_per_block = shmem + func_attr.sharedSizeBytes; + + // The minimum shared memory allocation we can have in total per SM is 8kB. + // If we want to lower occupancy we have to make sure we request at least that + // much in aggregate over all blocks, so that shared memory actually becomes a + // limiting factor for occupancy + constexpr size_t min_shmem_size_per_sm = 8192; + if ((occupancy < 100) && + (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) { + shmem_per_block = min_shmem_size_per_sm / num_blocks_desired; + // Need to set the caller's shmem variable so that the + // kernel launch uses the correct dynamic shared memory request + shmem = shmem_per_block - func_attr.sharedSizeBytes; + } + + // Compute the carveout fraction we need based on occupancy + // Use multiples of 8kB + const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; + size_t carveout = shmem_per_block == 0 + ? 0 + : 100 * + (((num_blocks_desired * shmem_per_block + + min_shmem_size_per_sm - 1) / + min_shmem_size_per_sm) * + min_shmem_size_per_sm) / + max_shmem_per_sm; + if (carveout > 100) carveout = 100; + + // Set the carveout, but only call it once per kernel or when it changes + auto set_cache_config = [&] { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute( + func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)); + return carveout; + }; + // Store the value in a static variable so we only reset if needed + static size_t cache_config_preference_cached = set_cache_config(); + if (cache_config_preference_cached != carveout) { cache_config_preference_cached = set_cache_config(); } #else // Use the parameters so we don't get a warning (void)func; - (void)prefer_shmem; + (void)device_props; + (void)block_size; + (void)occupancy; #endif } -template -std::enable_if_t -modify_launch_configuration_if_desired_occupancy_is_specified( - Policy const& policy, cudaDeviceProp const& properties, - cudaFuncAttributes const& attributes, dim3 const& block, int& shmem, - bool& prefer_shmem) { - int const block_size = block.x * block.y * block.z; - int const desired_occupancy = policy.impl_get_desired_occupancy().value(); - - size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties); - size_t const static_shmem = attributes.sharedSizeBytes; - - // round to nearest integer and avoid division by zero - int active_blocks = std::max( - 1, static_cast(std::round( - static_cast(properties.maxThreadsPerMultiProcessor) / - block_size * desired_occupancy / 100))); - int const dynamic_shmem = - shmem_per_sm_prefer_l1 / active_blocks - static_shmem; - - if (dynamic_shmem > shmem) { - shmem = dynamic_shmem; - prefer_shmem = false; - } -} - -template -std::enable_if_t -modify_launch_configuration_if_desired_occupancy_is_specified( - Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&, - dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {} - // end Some helper functions for launch code readability }}}1 //============================================================================== @@ -348,7 +400,7 @@ struct CudaParallelLaunchKernelInvoker< #ifdef KOKKOS_CUDA_ENABLE_GRAPHS inline static void create_parallel_launch_graph_node( DriverType const& driver, dim3 const& grid, 
dim3 const& block, int shmem, - CudaInternal const* cuda_instance, bool prefer_shmem) { + CudaInternal const* cuda_instance) { //---------------------------------------- auto const& graph = Impl::get_cuda_graph_from_kernel(driver); KOKKOS_EXPECTS(bool(graph)); @@ -358,8 +410,19 @@ struct CudaParallelLaunchKernelInvoker< if (!Impl::is_empty_launch(grid, block)) { Impl::check_shmem_request(cuda_instance, shmem); - Impl::configure_shmem_preference( - base_t::get_kernel_func(), prefer_shmem); + if (DriverType::Policy:: + experimental_contains_desired_occupancy) { + /* + int desired_occupancy = + driver.get_policy().impl_get_desired_occupancy().value(); + size_t block_size = block.x * block.y * block.z; + Impl::configure_shmem_preference( + base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, + shmem, desired_occupancy);*/ + Kokkos::Impl::throw_runtime_exception( + std::string("Cuda graph node creation FAILED:" + " occupancy requests are currently broken.")); + } void const* args[] = {&driver}; @@ -442,7 +505,7 @@ struct CudaParallelLaunchKernelInvoker< #ifdef KOKKOS_CUDA_ENABLE_GRAPHS inline static void create_parallel_launch_graph_node( DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, - CudaInternal const* cuda_instance, bool prefer_shmem) { + CudaInternal const* cuda_instance) { //---------------------------------------- auto const& graph = Impl::get_cuda_graph_from_kernel(driver); KOKKOS_EXPECTS(bool(graph)); @@ -452,8 +515,18 @@ struct CudaParallelLaunchKernelInvoker< if (!Impl::is_empty_launch(grid, block)) { Impl::check_shmem_request(cuda_instance, shmem); - Impl::configure_shmem_preference( - base_t::get_kernel_func(), prefer_shmem); + if (DriverType::Policy:: + experimental_contains_desired_occupancy) { + /*int desired_occupancy = + driver.get_policy().impl_get_desired_occupancy().value(); + size_t block_size = block.x * block.y * block.z; + Impl::configure_shmem_preference( + base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, + shmem, desired_occupancy);*/ + Kokkos::Impl::throw_runtime_exception( + std::string("Cuda graph node creation FAILED:" + " occupancy requests are currently broken.")); + } auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); @@ -566,7 +639,7 @@ struct CudaParallelLaunchKernelInvoker< #ifdef KOKKOS_CUDA_ENABLE_GRAPHS inline static void create_parallel_launch_graph_node( DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, - CudaInternal const* cuda_instance, bool prefer_shmem) { + CudaInternal const* cuda_instance) { // Just use global memory; coordinating through events to share constant // memory with the non-graph interface is not really reasonable since // events don't work with Graphs directly, and this would anyway require @@ -580,7 +653,7 @@ struct CudaParallelLaunchKernelInvoker< DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::GlobalMemory>; global_launch_impl_t::create_parallel_launch_graph_node( - driver, grid, block, shmem, cuda_instance, prefer_shmem); + driver, grid, block, shmem, cuda_instance); } #endif }; @@ -613,8 +686,7 @@ struct CudaParallelLaunchImpl< inline static void launch_kernel(const DriverType& driver, const dim3& grid, const dim3& block, int shmem, - const CudaInternal* cuda_instance, - bool prefer_shmem) { + const CudaInternal* cuda_instance) { if (!Impl::is_empty_launch(grid, block)) { // Prevent multiple threads to simultaneously set the cache configuration // preference and launch the same kernel @@ -623,20 +695,22 @@ struct 
CudaParallelLaunchImpl< Impl::check_shmem_request(cuda_instance, shmem); - // If a desired occupancy is specified, we compute how much shared memory - // to ask for to achieve that occupancy, assuming that the cache - // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic - // shared memory computed is actually smaller than `shmem` we overwrite - // `shmem` and set `prefer_shmem` to `false`. - modify_launch_configuration_if_desired_occupancy_is_specified( - driver.get_policy(), cuda_instance->m_deviceProp, - get_cuda_func_attributes(), block, shmem, prefer_shmem); + if (DriverType::Policy:: + experimental_contains_desired_occupancy) { + /*int desired_occupancy = + driver.get_policy().impl_get_desired_occupancy().value(); + size_t block_size = block.x * block.y * block.z; + Impl::configure_shmem_preference< + DriverType, + Kokkos::LaunchBounds>( + base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, + shmem, desired_occupancy);*/ + Kokkos::Impl::throw_runtime_exception( + std::string("Cuda graph node creation FAILED:" + " occupancy requests are currently broken.")); + } - Impl::configure_shmem_preference< - DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func(), prefer_shmem); - - ensure_cuda_lock_arrays_on_device(); + KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); // Invoke the driver function on the device base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); @@ -650,18 +724,9 @@ struct CudaParallelLaunchImpl< } static cudaFuncAttributes get_cuda_func_attributes() { - // Race condition inside of cudaFuncGetAttributes if the same address is - // given requires using a local variable as input instead of a static Rely - // on static variable initialization to make sure only one thread executes - // the code and the result is visible. 
- auto wrap_get_attributes = []() -> cudaFuncAttributes { - cudaFuncAttributes attr_tmp; - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func())); - return attr_tmp; - }; - static cudaFuncAttributes attr = wrap_get_attributes(); - return attr; + return get_cuda_kernel_func_attributes< + DriverType, Kokkos::LaunchBounds>( + base_t::get_kernel_func()); } }; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp index 3796534816..84d4307cfd 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp @@ -79,7 +79,8 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0}; void initialize_host_cuda_lock_arrays() { #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::Impl::init_lock_arrays(); - desul::ensure_cuda_lock_arrays_on_device(); + + DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); #endif if (g_host_cuda_lock_arrays.atomic != nullptr) return; KOKKOS_IMPL_CUDA_SAFE_CALL( @@ -88,7 +89,7 @@ void initialize_host_cuda_lock_arrays() { Impl::cuda_device_synchronize( "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays"); g_host_cuda_lock_arrays.n = Cuda::concurrency(); - copy_cuda_lock_arrays_to_device(); + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); Impl::cuda_device_synchronize( @@ -105,7 +106,7 @@ void finalize_host_cuda_lock_arrays() { g_host_cuda_lock_arrays.atomic = nullptr; g_host_cuda_lock_arrays.n = 0; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - copy_cuda_lock_arrays_to_device(); + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); #endif } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp index 244f142f0d..bdb7723985 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp @@ -67,7 +67,7 @@ struct CudaLockArrays { /// \brief This global variable in Host space is the central definition /// of these arrays. -extern CudaLockArrays g_host_cuda_lock_arrays; +extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays; /// \brief After this call, the g_host_cuda_lock_arrays variable has /// valid, initialized arrays. @@ -105,12 +105,12 @@ namespace Impl { /// instances in other translation units, we must update this CUDA global /// variable based on the Host global variable prior to running any kernels /// that will use it. -/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. +/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro. 
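The hunk above replaces the lambda-wrapped static with a call to get_cuda_kernel_func_attributes, which caches the attribute query per kernel. A reduced sketch of that caching pattern, assuming only the CUDA runtime API; the helper name is illustrative and not the Kokkos implementation:

#include <cuda_runtime.h>

// One cached cudaFuncAttributes per KernelTag instantiation. The
// function-local static is initialized exactly once even with concurrent
// callers (guaranteed since C++11), so the attribute query cannot race.
template <class KernelTag>
cudaFuncAttributes get_func_attributes_cached(const void* kernel_func) {
  static cudaFuncAttributes attr = [kernel_func]() {
    cudaFuncAttributes a{};
    cudaFuncGetAttributes(&a, kernel_func);  // error handling omitted in this sketch
    return a;
  }();
  return attr;
}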
__device__ #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE __constant__ extern #endif - CudaLockArrays g_device_cuda_lock_arrays; + Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays; #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF @@ -123,7 +123,9 @@ __device__ inline bool lock_address_cuda_space(void* ptr) { size_t offset = size_t(ptr); offset = offset >> 2; offset = offset & CUDA_SPACE_ATOMIC_MASK; - return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1)); + return ( + 0 == + atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1)); } /// \brief Release lock for the address @@ -136,7 +138,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) { size_t offset = size_t(ptr); offset = offset >> 2; offset = offset & CUDA_SPACE_ATOMIC_MASK; - atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0); + atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0); } } // namespace Impl @@ -149,49 +151,45 @@ namespace { static int lock_array_copied = 0; inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace +} // namespace Impl +} // namespace Kokkos -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -inline -#else -static -#endif - void - copy_cuda_lock_arrays_to_device() { - if (lock_array_copied == 0) { - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays, - &g_host_cuda_lock_arrays, - sizeof(CudaLockArrays))); +/* Dan Ibanez: it is critical that this code be a macro, so that it will + capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays! + putting this in an inline function will NOT do the right thing! */ +#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ + { \ + if (::Kokkos::Impl::lock_array_copied == 0) { \ + KOKKOS_IMPL_CUDA_SAFE_CALL( \ + cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \ + &Kokkos::Impl::g_host_cuda_lock_arrays, \ + sizeof(Kokkos::Impl::CudaLockArrays))); \ + } \ + lock_array_copied = 1; \ } - lock_array_copied = 1; -} #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -inline void ensure_cuda_lock_arrays_on_device() {} +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #else -inline static void ensure_cuda_lock_arrays_on_device() { - copy_cuda_lock_arrays_to_device(); -} +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() #endif #else #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -inline void ensure_cuda_lock_arrays_on_device() {} +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #else // Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. 
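The copy routine above becomes a macro so the cudaMemcpyToSymbol call is expanded in the calling translation unit and binds to that unit's instance of the __device__ lock array. A stripped-down sketch of the same pattern; the symbol and macro names are illustrative only:

#include <cuda_runtime.h>

// Without relocatable device code, every translation unit that includes this
// header owns a separate copy of the __device__ symbol, so the copy must be
// expanded at the call site rather than hidden inside an inline function.
__device__ int g_example_lock_flags[128];

#define EXAMPLE_COPY_LOCK_FLAGS_TO_DEVICE(host_ptr)          \
  {                                                          \
    cudaMemcpyToSymbol(g_example_lock_flags, (host_ptr),     \
                       sizeof(g_example_lock_flags));        \
  }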
-inline static void ensure_cuda_lock_arrays_on_device() { - copy_cuda_lock_arrays_to_device(); - desul::ensure_cuda_lock_arrays_on_device(); -} +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ + DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #endif #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ -} // namespace Impl -} // namespace Kokkos - #endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index e586bb4cc6..7e4f62f12e 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -67,6 +67,34 @@ namespace Kokkos { namespace Impl { +template +int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { + cudaFuncAttributes attr = + CudaParallelLaunch::get_cuda_func_attributes(); + auto const& prop = pol.space().cuda_device_prop(); + + // Limits due to registers/SM, MDRange doesn't have + // shared memory constraints + int const optimal_block_size = + Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{}); + + // Compute how many blocks of this size we can launch, based on warp + // constraints + int const max_warps_per_sm_registers = + Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr); + int const max_num_threads_from_warps = + max_warps_per_sm_registers * prop.warpSize; + int const max_num_blocks = max_num_threads_from_warps / optimal_block_size; + + // Compute the total number of threads + int const max_threads_per_sm = optimal_block_size * max_num_blocks; + + return std::min( + max_threads_per_sm, + static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); +} + template class ParallelFor, Kokkos::Cuda> { public: @@ -85,18 +113,7 @@ class ParallelFor, Kokkos::Cuda> { public: template static int max_tile_size_product(const Policy& pol, const Functor&) { - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); - auto const& prop = pol.space().cuda_device_prop(); - // Limits due to registers/SM, MDRange doesn't have - // shared memory constraints - int const regs_per_sm = prop.regsPerMultiprocessor; - int const regs_per_thread = attr.numRegs; - int const max_threads_per_sm = regs_per_sm / regs_per_thread; - return std::min( - max_threads_per_sm, - static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + return max_tile_size_product_helper(pol, LaunchBounds{}); } Policy const& get_policy() const { return m_rp; } inline __device__ void operator()() const { @@ -121,8 +138,7 @@ class ParallelFor, Kokkos::Cuda> { maxblocks[1]), 1); CudaParallelLaunch( - *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), - false); + *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 3) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); KOKKOS_ASSERT(block.x > 0); @@ -139,8 +155,7 @@ class ParallelFor, Kokkos::Cuda> { (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, maxblocks[2])); CudaParallelLaunch( - *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), - false); + *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 4) { // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to // threadIdx.z @@ -158,8 +173,7 @@ class ParallelFor, Kokkos::Cuda> { (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / 
block.z, maxblocks[2])); CudaParallelLaunch( - *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), - false); + *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 5) { // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to // threadIdx.z @@ -175,8 +189,7 @@ class ParallelFor, Kokkos::Cuda> { (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, maxblocks[2])); CudaParallelLaunch( - *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), - false); + *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 6) { // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to // threadIdx.z @@ -191,8 +204,7 @@ class ParallelFor, Kokkos::Cuda> { std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], maxblocks[2])); CudaParallelLaunch( - *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), - false); + *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else { Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); } @@ -263,17 +275,7 @@ class ParallelReduce, ReducerType, public: template static int max_tile_size_product(const Policy& pol, const Functor&) { - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); - auto const& prop = pol.space().cuda_device_prop(); - // Limits due do registers/SM - int const regs_per_sm = prop.regsPerMultiprocessor; - int const regs_per_thread = attr.numRegs; - int const max_threads_per_sm = regs_per_sm / regs_per_thread; - return std::min( - max_threads_per_sm, - static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + return max_tile_size_product_helper(pol, LaunchBounds{}); } Policy const& get_policy() const { return m_policy; } inline __device__ void exec_range(reference_type update) const { @@ -405,8 +407,8 @@ class ParallelReduce, ReducerType, CudaParallelLaunch( *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute + m_policy.space() + .impl_internal_space_instance()); // copy to device and execute if (!m_result_ptr_device_accessible) { if (m_result_ptr) { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 9873343006..d1031751c2 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -135,8 +135,7 @@ class ParallelFor, Kokkos::Cuda> { #endif CudaParallelLaunch( - *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), - false); + *this, grid, block, 0, m_policy.space().impl_internal_space_instance()); } ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) @@ -375,8 +374,8 @@ class ParallelReduce, ReducerType, CudaParallelLaunch( *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute + m_policy.space() + .impl_internal_space_instance()); // copy to device and execute if (!m_result_ptr_device_accessible) { if (m_result_ptr) { @@ -465,8 +464,24 @@ class ParallelScan, Kokkos::Cuda> { public: using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; + using value_type = typename Analysis::value_type; using functor_type = FunctorType; using size_type = Cuda::size_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t 
(Kokkos::Cuda::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t, size_type>; private: // Algorithmic constraints: @@ -477,7 +492,7 @@ class ParallelScan, Kokkos::Cuda> { const FunctorType m_functor; const Policy m_policy; - size_type* m_scratch_space; + word_size_type* m_scratch_space; size_type* m_scratch_flags; size_type m_final; #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION @@ -501,12 +516,12 @@ class ParallelScan, Kokkos::Cuda> { __device__ inline void initial() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); - size_type* const shared_value = - kokkos_impl_cuda_shared_memory() + + word_size_type* const shared_value = + kokkos_impl_cuda_shared_memory() + word_count.value * threadIdx.y; final_reducer.init(reinterpret_cast(shared_value)); @@ -532,7 +547,7 @@ class ParallelScan, Kokkos::Cuda> { // gridDim.x cuda_single_inter_block_reduce_scan( final_reducer, blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory(), m_scratch_space, + kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags); } @@ -541,21 +556,22 @@ class ParallelScan, Kokkos::Cuda> { __device__ inline void final() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // value[2] , ... } - size_type* const shared_data = kokkos_impl_cuda_shared_memory(); - size_type* const shared_prefix = + word_size_type* const shared_data = + kokkos_impl_cuda_shared_memory(); + word_size_type* const shared_prefix = shared_data + word_count.value * threadIdx.y; - size_type* const shared_accum = + word_size_type* const shared_accum = shared_data + word_count.value * (blockDim.y + 1); // Starting value for this thread block is the previous block's total. 
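The scan above now derives its scratch word type from the scanned value type instead of always using the 4-byte size_type, so values narrower than 4 bytes are indexed at the right stride. A standalone sketch of that trait using only the standard library; the alias name is illustrative:

#include <cstdint>
#include <type_traits>

// Choose a scratch word no wider than the scanned value type; fall back to
// the 4-byte default when the value type is at least 4 bytes wide.
template <class ValueType, class SizeType = std::uint32_t>
using scan_word_size_t = std::conditional_t<
    (sizeof(ValueType) < sizeof(SizeType)),
    std::conditional_t<sizeof(ValueType) == 2, std::int16_t, std::int8_t>,
    SizeType>;

static_assert(std::is_same<scan_word_size_t<std::int8_t>, std::int8_t>::value,
              "1-byte values scan with 1-byte words");
static_assert(std::is_same<scan_word_size_t<double>, std::uint32_t>::value,
              "wide values keep the 4-byte default");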
if (blockIdx.x) { - size_type* const block_total = + word_size_type* const block_total = m_scratch_space + word_count.value * (blockIdx.x - 1); for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -602,7 +618,7 @@ class ParallelScan, Kokkos::Cuda> { typename Analysis::pointer_type(shared_data + word_count.value)); { - size_type* const block_total = + word_size_type* const block_total = shared_data + word_count.value * blockDim.y; for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -690,8 +706,9 @@ class ParallelScan, Kokkos::Cuda> { // How many block are really needed for this much work: const int grid_x = (nwork + work_per_block - 1) / work_per_block; - m_scratch_space = cuda_internal_scratch_space( - m_policy.space(), Analysis::value_size(m_functor) * grid_x); + m_scratch_space = + reinterpret_cast(cuda_internal_scratch_space( + m_policy.space(), Analysis::value_size(m_functor) * grid_x)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); @@ -708,16 +725,16 @@ class ParallelScan, Kokkos::Cuda> { m_final = false; CudaParallelLaunch( *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute + m_policy.space() + .impl_internal_space_instance()); // copy to device and execute #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION } #endif m_final = true; CudaParallelLaunch( *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute + m_policy.space() + .impl_internal_space_instance()); // copy to device and execute } } @@ -752,10 +769,26 @@ class ParallelScanWithTotal, Policy, FunctorType>; public: + using value_type = typename Analysis::value_type; using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; using size_type = Cuda::size_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::Cuda::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
+ using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t, size_type>; private: // Algorithmic constraints: @@ -766,7 +799,7 @@ class ParallelScanWithTotal, const FunctorType m_functor; const Policy m_policy; - size_type* m_scratch_space; + word_size_type* m_scratch_space; size_type* m_scratch_flags; size_type m_final; ReturnType& m_returnvalue; @@ -791,12 +824,12 @@ class ParallelScanWithTotal, __device__ inline void initial() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); - size_type* const shared_value = - kokkos_impl_cuda_shared_memory() + + word_size_type* const shared_value = + kokkos_impl_cuda_shared_memory() + word_count.value * threadIdx.y; final_reducer.init(reinterpret_cast(shared_value)); @@ -822,7 +855,7 @@ class ParallelScanWithTotal, // gridDim.x cuda_single_inter_block_reduce_scan( final_reducer, blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory(), m_scratch_space, + kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags); } @@ -831,21 +864,22 @@ class ParallelScanWithTotal, __device__ inline void final() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // value[2] , ... } - size_type* const shared_data = kokkos_impl_cuda_shared_memory(); - size_type* const shared_prefix = + word_size_type* const shared_data = + kokkos_impl_cuda_shared_memory(); + word_size_type* const shared_prefix = shared_data + word_count.value * threadIdx.y; - size_type* const shared_accum = + word_size_type* const shared_accum = shared_data + word_count.value * (blockDim.y + 1); // Starting value for this thread block is the previous block's total. 
if (blockIdx.x) { - size_type* const block_total = + word_size_type* const block_total = m_scratch_space + word_count.value * (blockIdx.x - 1); for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -894,7 +928,7 @@ class ParallelScanWithTotal, typename Analysis::pointer_type(shared_data + word_count.value)); { - size_type* const block_total = + word_size_type* const block_total = shared_data + word_count.value * blockDim.y; for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -983,8 +1017,9 @@ class ParallelScanWithTotal, // How many block are really needed for this much work: const int grid_x = (nwork + work_per_block - 1) / work_per_block; - m_scratch_space = cuda_internal_scratch_space( - m_policy.space(), Analysis::value_size(m_functor) * grid_x); + m_scratch_space = + reinterpret_cast(cuda_internal_scratch_space( + m_policy.space(), Analysis::value_size(m_functor) * grid_x)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); @@ -1002,16 +1037,16 @@ class ParallelScanWithTotal, m_final = false; CudaParallelLaunch( *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute + m_policy.space() + .impl_internal_space_instance()); // copy to device and execute #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION } #endif m_final = true; CudaParallelLaunch( *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute + m_policy.space() + .impl_internal_space_instance()); // copy to device and execute const int size = Analysis::value_size(m_functor); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION @@ -1022,7 +1057,8 @@ class ParallelScanWithTotal, #endif DeepCopy( m_policy.space(), &m_returnvalue, - m_scratch_space + (grid_x - 1) * size / sizeof(int), size); + m_scratch_space + (grid_x - 1) * size / sizeof(word_size_type), + size); } } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index cdd16085b3..ea9430b812 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -552,8 +552,8 @@ class ParallelFor, CudaParallelLaunch( *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute + m_policy.space() + .impl_internal_space_instance()); // copy to device and execute } ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) @@ -878,8 +878,8 @@ class ParallelReduce, CudaParallelLaunch( *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute + m_policy.space() + .impl_internal_space_instance()); // copy to device and execute if (!m_result_ptr_device_accessible) { m_policy.space().fence( diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 078315b65d..178012431c 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -116,6 +116,7 @@ __device__ inline void cuda_inter_warp_reduction( value = result[0]; for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++) reducer.join(&value, &result[i]); + __syncthreads(); } template @@ -427,11 +428,6 @@ struct CudaReductionsFunctor { // 
__launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) // function qualifier which could be used to improve performance. //---------------------------------------------------------------------------- -// Maximize shared memory and minimize L1 cache: -// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared ); -// For 2.0 capability: 48 KB shared and 16 KB L1 -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- /* * Algorithmic constraints: * (a) blockDim.y <= 1024 diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index fb3a6b138f..a12378a891 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -100,8 +100,7 @@ class ParallelFor, const int shared = 0; Kokkos::Impl::CudaParallelLaunch( - *this, grid, block, shared, Cuda().impl_internal_space_instance(), - false); + *this, grid, block, shared, Cuda().impl_internal_space_instance()); } inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 5c871e0d61..dca1fb9073 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -448,11 +448,27 @@ class ParallelScanHIPBase { Policy, FunctorType>; public: + using value_type = typename Analysis::value_type; using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; using size_type = Kokkos::Experimental::HIP::size_type; using index_type = typename Policy::index_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::HIP::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t, size_type>; protected: // Algorithmic constraints: @@ -463,10 +479,10 @@ class ParallelScanHIPBase { const FunctorType m_functor; const Policy m_policy; - size_type* m_scratch_space = nullptr; - size_type* m_scratch_flags = nullptr; - size_type m_final = false; - int m_grid_x = 0; + word_size_type* m_scratch_space = nullptr; + size_type* m_scratch_flags = nullptr; + size_type m_final = false; + int m_grid_x = 0; // Only let one ParallelReduce/Scan modify the shared memory. The // constructor acquires the mutex which is released in the destructor. 
std::lock_guard m_shared_memory_lock; @@ -489,12 +505,12 @@ class ParallelScanHIPBase { __device__ inline void initial() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); pointer_type const shared_value = reinterpret_cast( - Kokkos::Experimental::kokkos_impl_hip_shared_memory() + + Kokkos::Experimental::kokkos_impl_hip_shared_memory() + word_count.value * threadIdx.y); final_reducer.init(shared_value); @@ -518,7 +534,7 @@ class ParallelScanHIPBase { // gridDim.x hip_single_inter_block_reduce_scan( final_reducer, blockIdx.x, gridDim.x, - Kokkos::Experimental::kokkos_impl_hip_shared_memory(), + Kokkos::Experimental::kokkos_impl_hip_shared_memory(), m_scratch_space, m_scratch_flags); } @@ -527,22 +543,22 @@ class ParallelScanHIPBase { __device__ inline void final() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // value[2] , ... } - size_type* const shared_data = - Kokkos::Experimental::kokkos_impl_hip_shared_memory(); - size_type* const shared_prefix = + word_size_type* const shared_data = + Kokkos::Experimental::kokkos_impl_hip_shared_memory(); + word_size_type* const shared_prefix = shared_data + word_count.value * threadIdx.y; - size_type* const shared_accum = + word_size_type* const shared_accum = shared_data + word_count.value * (blockDim.y + 1); // Starting value for this thread block is the previous block's total. 
if (blockIdx.x) { - size_type* const block_total = + word_size_type* const block_total = m_scratch_space + word_count.value * (blockIdx.x - 1); for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -588,7 +604,7 @@ class ParallelScanHIPBase { typename Analysis::pointer_type(shared_data + word_count.value)); { - size_type* const block_total = + word_size_type* const block_total = shared_data + word_count.value * blockDim.y; for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -647,8 +663,9 @@ class ParallelScanHIPBase { // How many block are really needed for this much work: m_grid_x = (nwork + work_per_block - 1) / work_per_block; - m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space( - m_policy.space(), Analysis::value_size(m_functor) * m_grid_x); + m_scratch_space = reinterpret_cast( + Kokkos::Experimental::Impl::hip_internal_scratch_space( + m_policy.space(), Analysis::value_size(m_functor) * m_grid_x)); m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags( m_policy.space(), sizeof(size_type) * 1); @@ -734,7 +751,8 @@ class ParallelScanWithTotal, DeepCopy( Base::m_policy.space(), &m_returnvalue, - Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int), + Base::m_scratch_space + (Base::m_grid_x - 1) * size / + sizeof(typename Base::word_size_type), size); } } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp index 1091ad5cea..9002f69589 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp @@ -225,11 +225,11 @@ struct HIPReductionsFunctor { } } + template __device__ static inline bool scalar_inter_block_reduction( FunctorType const& functor, ::Kokkos::Experimental::HIP::size_type const block_count, - ::Kokkos::Experimental::HIP::size_type* const shared_data, - ::Kokkos::Experimental::HIP::size_type* const global_data, + SizeType* const shared_data, SizeType* const global_data, ::Kokkos::Experimental::HIP::size_type* const global_flags) { Scalar* const global_team_buffer_element = reinterpret_cast(global_data); @@ -411,16 +411,14 @@ __device__ void hip_intra_block_reduce_scan( * Global reduce result is in the last threads' 'shared_data' location. 
*/ -template +template __device__ bool hip_single_inter_block_reduce_scan_impl( FunctorType const& functor, ::Kokkos::Experimental::HIP::size_type const block_id, ::Kokkos::Experimental::HIP::size_type const block_count, - ::Kokkos::Experimental::HIP::size_type* const shared_data, - ::Kokkos::Experimental::HIP::size_type* const global_data, + SizeType* const shared_data, SizeType* const global_data, ::Kokkos::Experimental::HIP::size_type* const global_flags) { - using size_type = ::Kokkos::Experimental::HIP::size_type; - + using size_type = SizeType; using value_type = typename FunctorType::value_type; using pointer_type = typename FunctorType::pointer_type; @@ -518,13 +516,12 @@ __device__ bool hip_single_inter_block_reduce_scan_impl( return is_last_block; } -template +template __device__ bool hip_single_inter_block_reduce_scan( FunctorType const& functor, ::Kokkos::Experimental::HIP::size_type const block_id, ::Kokkos::Experimental::HIP::size_type const block_count, - ::Kokkos::Experimental::HIP::size_type* const shared_data, - ::Kokkos::Experimental::HIP::size_type* const global_data, + SizeType* const shared_data, SizeType* const global_data, ::Kokkos::Experimental::HIP::size_type* const global_flags) { // If we are doing a reduction and we don't do an array reduction, we use the // reduction-only path. Otherwise, we use the common path between reduction diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index eb85ed4709..d0bbc18da8 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -116,6 +116,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction( value = result[0]; for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i) reducer.join(&value, &result[i]); + __syncthreads(); } template diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp index 0a66ee9da7..d859a5d8ae 100644 --- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp @@ -3711,12 +3711,13 @@ namespace Impl { template inline std::enable_if_t< - (std::is_same< - typename Kokkos::View::memory_space, - typename Kokkos::View::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), + !Impl::ViewCtorProp::has_memory_space && + (std::is_same< + typename Kokkos::View::memory_space, + typename Kokkos::View::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::View::data_type, + typename Kokkos::View::HostMirror::data_type>::value), typename Kokkos::View::HostMirror> create_mirror_view(const Kokkos::View& src, const Impl::ViewCtorProp&) { @@ -3725,12 +3726,13 @@ create_mirror_view(const Kokkos::View& src, template inline std::enable_if_t< - !(std::is_same< - typename Kokkos::View::memory_space, - typename Kokkos::View::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View::data_type, - typename Kokkos::View::HostMirror::data_type>::value), + !Impl::ViewCtorProp::has_memory_space && + !(std::is_same::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::View::data_type, + typename Kokkos::View::HostMirror::data_type>::value), typename Kokkos::View::HostMirror> create_mirror_view(const Kokkos::View& src, const Impl::ViewCtorProp& arg_prop) { @@ -3738,25 +3740,33 @@ create_mirror_view(const 
Kokkos::View& src, } // Create a mirror view in a new space (specialization for same space) -template -std::enable_if_t::is_same_memspace, - typename Impl::MirrorViewType::view_type> -create_mirror_view(const Space&, const Kokkos::View& src, +template ::has_memory_space>> +std::enable_if_t::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::View& src, const Impl::ViewCtorProp&) { return src; } // Create a mirror view in a new space (specialization for different space) -template -std::enable_if_t::is_same_memspace, - typename Impl::MirrorViewType::view_type> -create_mirror_view(const Space&, const Kokkos::View& src, +template ::has_memory_space>> +std::enable_if_t::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorViewType< + typename Impl::ViewCtorProp::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::View& src, const Impl::ViewCtorProp& arg_prop) { - using MemorySpace = typename Space::memory_space; - using alloc_prop = Impl::ViewCtorProp; - alloc_prop prop_copy(arg_prop); - - return Kokkos::Impl::create_mirror(src, prop_copy); + return Kokkos::Impl::create_mirror(src, arg_prop); } } // namespace Impl @@ -3815,9 +3825,10 @@ typename Impl::MirrorViewType::view_type create_mirror_view( template ::value>> typename Impl::MirrorViewType::view_type create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, Space const& space, + Kokkos::Impl::WithoutInitializing_t wi, Space const&, Kokkos::View const& v) { - return Impl::create_mirror_view(space, v, view_alloc(wi)); + return Impl::create_mirror_view( + v, view_alloc(typename Space::memory_space{}, wi)); } template diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index e92ed7d2e9..f8dcfc869e 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -1754,7 +1754,10 @@ struct RankDataType { }; template -KOKKOS_FUNCTION std::enable_if_t::Rank, View> +KOKKOS_FUNCTION std::enable_if_t< + N == View::Rank && + std::is_same::specialize, void>::value, + View> as_view_of_rank_n(View v) { return v; } @@ -1762,13 +1765,13 @@ as_view_of_rank_n(View v) { // Placeholder implementation to compile generic code for DynRankView; should // never be called template -std::enable_if_t< - N != View::Rank, +KOKKOS_FUNCTION std::enable_if_t< + N != View::Rank && + std::is_same::specialize, void>::value, View::value_type, N>::type, Args...>> as_view_of_rank_n(View) { - Kokkos::Impl::throw_runtime_exception( - "Trying to get at a View of the wrong rank"); + Kokkos::abort("Trying to get at a View of the wrong rank"); return {}; } diff --git a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp index fafd825df2..129a489387 100644 --- a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -101,8 +101,8 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { void push_work(const std::int32_t w) const noexcept { const std::int32_t N = m_graph.numRows(); - std::int32_t volatile* const ready_queue = &m_queue[0]; - std::int32_t volatile* const end_hint = &m_queue[2 * N + 1]; + std::int32_t* const ready_queue = &m_queue[0]; + std::int32_t* const end_hint = &m_queue[2 * N + 1]; // Push work to end of queue const std::int32_t j = atomic_fetch_add(end_hint, 1); @@ -134,14 +134,14 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { 
std::int32_t pop_work() const noexcept { const std::int32_t N = m_graph.numRows(); - std::int32_t volatile* const ready_queue = &m_queue[0]; - std::int32_t volatile* const begin_hint = &m_queue[2 * N]; + std::int32_t* const ready_queue = &m_queue[0]; + std::int32_t* const begin_hint = &m_queue[2 * N]; // begin hint is guaranteed to be less than or equal to // actual begin location in the queue. - for (std::int32_t i = *begin_hint; i < N; ++i) { - const std::int32_t w = ready_queue[i]; + for (std::int32_t i = Kokkos::atomic_load(begin_hint); i < N; ++i) { + const std::int32_t w = Kokkos::atomic_load(&ready_queue[i]); if (w == END_TOKEN) { return END_TOKEN; @@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { const std::int32_t N = m_graph.numRows(); - std::int32_t volatile* const count_queue = &m_queue[N]; + std::int32_t* const count_queue = &m_queue[N]; const std::int32_t B = m_graph.row_map(w); const std::int32_t E = m_graph.row_map(w + 1); @@ -199,7 +199,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { KOKKOS_INLINE_FUNCTION void operator()(const TagCount, int i) const noexcept { - std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()]; + std::int32_t* const count_queue = &m_queue[m_graph.numRows()]; atomic_increment(count_queue + m_graph.entries[i]); } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp index f9451ecfe6..2ce1efb98c 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp @@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl { struct OpenACC_Traits { #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ - defined(KOKKOS_ARCH_AMPERE) + defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER) static constexpr acc_device_t dev_type = acc_device_nvidia; static constexpr bool may_fallback_to_host = false; #else diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 51921765ba..27ee1d4232 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -47,6 +47,7 @@ #endif #include +#include #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) @@ -114,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() { // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // from Pascal and upwards. 
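pop_work above now reads queue slots through Kokkos::atomic_load instead of volatile pointers, which makes the ordering explicit and works with the desul atomics backend. A minimal sketch of that consume side for a ready-queue of int32_t slots; the sentinel values and function name are illustrative:

#include <Kokkos_Core.hpp>
#include <cstdint>

// Scan a ready-queue for the next claimable entry. Producers publish work
// indices with atomic stores; a consumer claims an entry by swapping in -1.
KOKKOS_INLINE_FUNCTION
std::int32_t pop_ready_work(std::int32_t* queue, std::int32_t n) {
  for (std::int32_t i = 0; i < n; ++i) {
    const std::int32_t w = Kokkos::atomic_load(&queue[i]);
    if (w < 0) continue;  // slot empty (-2) or already claimed (-1)
    if (Kokkos::atomic_compare_exchange(&queue[i], w, std::int32_t(-1)) == w)
      return w;  // this thread won the claim
  }
  return -2;  // nothing ready
}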
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ - defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) +#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ + defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ + defined(KOKKOS_ARCH_HOPPER) #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) omp_set_num_teams(512); #endif @@ -164,7 +166,11 @@ void OpenMPTarget::impl_static_fence(const std::string& name) { name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes); } -void OpenMPTarget::impl_initialize(InitializationSettings const&) { +void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { + using Kokkos::Impl::get_gpu; + const int device_num = get_gpu(settings); + omp_set_default_device(device_num); + Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); } void OpenMPTarget::impl_finalize() { diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 840db4327c..7e5addbc5b 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { #if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \ !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \ !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \ - !defined(KOKKOS_ARCH_AMPERE) + !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER) if (!settings.has_device_id() && gpu_devices.empty()) { Impl::SYCLInternal::singleton().initialize(sycl::device()); return; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp index 5ac7d8af30..ba101f699e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -335,9 +335,10 @@ class TeamPolicyInternal return std::min({ int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. -#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ - defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ - defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) +#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ + defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ + defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ + defined(KOKKOS_ARCH_HOPPER) 256, #endif max_threads_for_memory @@ -367,9 +368,10 @@ class TeamPolicyInternal return std::min({ int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. 
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ - defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ - defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) +#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ + defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ + defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ + defined(KOKKOS_ARCH_HOPPER) 256, #endif max_threads_for_memory diff --git a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp index c1cb6a7d91..ecece72cf9 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp @@ -110,10 +110,9 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept { return ((uint64_t)a) | (((uint64_t)d) << 32); -#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \ - defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__) +#elif defined(__powerpc64__) || defined(__ppc64__) - unsigned int cycles = 0; + unsigned long cycles = 0; asm volatile("mftb %0" : "=r"(cycles)); diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index f624e7a14c..a5bd003237 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -166,6 +166,8 @@ int get_device_count() { #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); +#elif defined(KOKKOS_ENABLE_OPENMPTARGET) + return omp_get_num_devices(); #else Kokkos::abort("implementation bug"); return -1; @@ -426,11 +428,17 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { Kokkos::abort("implementation bug"); } - auto const* local_rank_str = - std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); // OpenMPI - if (!local_rank_str) - local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); // MVAPICH2 - if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); // SLURM + char const* local_rank_str = nullptr; + for (char const* env_var : { + "OMPI_COMM_WORLD_LOCAL_RANK", // OpenMPI + "MV2_COMM_WORLD_LOCAL_RANK", // MVAPICH2 + "MPI_LOCALRANKID", // MPICH + "SLURM_LOCALID", // SLURM + "PMI_LOCAL_RANK" // PMI + }) { + local_rank_str = std::getenv(env_var); + if (local_rank_str) break; + } // use first GPU available for execution if unable to detect local MPI rank if (!local_rank_str) { diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index 738231677c..994dd0b2ad 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -1128,9 +1128,8 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr ViewOffset( const ViewOffset& rhs) : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { - static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || - (DimRHS::rank == 1 && dimension_type::rank == 1 && - dimension_type::rank_dynamic == 1), + static_assert(((DimRHS::rank == 0 && dimension_type::rank == 0) || + (DimRHS::rank == 1 && dimension_type::rank == 1)), "ViewOffset LayoutLeft and LayoutRight are only compatible " "when rank <= 1"); } @@ -1778,8 +1777,7 @@ struct ViewOffset< const ViewOffset& rhs) : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || - (DimRHS::rank == 1 && dimension_type::rank == 1 && - dimension_type::rank_dynamic == 1), + (DimRHS::rank == 1 && dimension_type::rank 
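get_gpu above now walks a list of launcher-specific environment variables so MPICH and PMI jobs also map ranks to GPUs. A host-only sketch of that lookup, using just the standard library and the same variable names added in the hunk; the function name is illustrative:

#include <cstdlib>
#include <initializer_list>
#include <string>

// Return the MPI local rank advertised by a known launcher, or -1 so the
// caller can fall back to a default device.
inline int detect_mpi_local_rank() {
  for (const char* env_var : {
           "OMPI_COMM_WORLD_LOCAL_RANK",  // OpenMPI
           "MV2_COMM_WORLD_LOCAL_RANK",   // MVAPICH2
           "MPI_LOCALRANKID",             // MPICH
           "SLURM_LOCALID",               // SLURM
           "PMI_LOCAL_RANK"               // PMI
       }) {
    if (const char* value = std::getenv(env_var)) return std::stoi(value);
  }
  return -1;
}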
== 1), "ViewOffset LayoutRight and LayoutLeft are only compatible " "when rank <= 1"); } @@ -3059,10 +3057,10 @@ struct ViewValueFunctor { std::is_trivially_copy_assignable::value> construct_shared_allocation() { // Shortcut for zero initialization - ValueType value{}; // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX + ValueType value{}; if (Impl::is_zero_byte(value)) { uint64_t kpID = 0; if (Kokkos::Profiling::profileLibraryLoaded()) { @@ -3539,9 +3537,7 @@ class ViewMapping< typename SrcTraits::array_layout>::value || std::is_same::value || - (DstTraits::dimension::rank == 0) || - (DstTraits::dimension::rank == 1 && - DstTraits::dimension::rank_dynamic == 1) + (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1) }; public: diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt index 24f70c0ccb..16fdb39d1a 100644 --- a/lib/kokkos/core/unit_test/CMakeLists.txt +++ b/lib/kokkos/core/unit_test/CMakeLists.txt @@ -73,6 +73,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) SET(COMPILE_ONLY_SOURCES TestArray.cpp + TestCreateMirror.cpp TestDetectionIdiom.cpp TestInterOp.cpp TestLegionInteroperability.cpp @@ -86,6 +87,7 @@ ENDIF() KOKKOS_ADD_EXECUTABLE( TestCompileOnly SOURCES + TestCompileMain.cpp ${COMPILE_ONLY_SOURCES} ) diff --git a/lib/kokkos/core/unit_test/TestCompileMain.cpp b/lib/kokkos/core/unit_test/TestCompileMain.cpp new file mode 100644 index 0000000000..237c8ce181 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestCompileMain.cpp @@ -0,0 +1 @@ +int main() {} diff --git a/lib/kokkos/core/unit_test/TestCreateMirror.cpp b/lib/kokkos/core/unit_test/TestCreateMirror.cpp new file mode 100644 index 0000000000..e8b3b6ea10 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestCreateMirror.cpp @@ -0,0 +1,126 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +template +void check_memory_space(TestView, MemorySpace) { + static_assert( + std::is_same::value, ""); +} + +template +auto host_mirror_test_space(View) { + return std::conditional_t< + Kokkos::SpaceAccessibility::accessible, + typename View::memory_space, Kokkos::HostSpace>{}; +} + +template +void test_create_mirror_properties(const View& view) { + using namespace Kokkos; + using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space; + + // clang-format off + + // create_mirror + check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view)); + check_memory_space(create_mirror( view), host_mirror_test_space(view)); + check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + + // create_mirror_view + check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view( view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + + // create_mirror view_alloc + check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view view_alloc + check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror view_alloc + execution space + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view view_alloc + execution space + 
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy + check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy view_alloc + check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy view_alloc + execution space + check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{}); + + // clang-format on +} + +void test() { + Kokkos::View device_view("device view", + 10); + Kokkos::View host_view("host view", 10); + + test_create_mirror_properties(device_view); + test_create_mirror_properties(host_view); +} diff --git a/lib/kokkos/core/unit_test/TestDetectionIdiom.cpp b/lib/kokkos/core/unit_test/TestDetectionIdiom.cpp index f87fda6156..23da339cae 100644 --- a/lib/kokkos/core/unit_test/TestDetectionIdiom.cpp +++ b/lib/kokkos/core/unit_test/TestDetectionIdiom.cpp @@ -92,5 +92,3 @@ static_assert(std::is_same, int>::value, static_assert(std::is_same, std::ptrdiff_t>::value, "Bark's difference_type should be ptrdiff_t!"); } // namespace Example - -int main() {} diff --git a/lib/kokkos/core/unit_test/TestScan.hpp b/lib/kokkos/core/unit_test/TestScan.hpp index 1a4056af07..356ffde956 100644 --- a/lib/kokkos/core/unit_test/TestScan.hpp +++ b/lib/kokkos/core/unit_test/TestScan.hpp @@ -45,20 +45,23 @@ #include #include -namespace Test { +namespace { -template +template struct TestScan { using execution_space = Device; - using value_type = int64_t; + using value_type = T; Kokkos::View > errors; KOKKOS_INLINE_FUNCTION void operator()(const int iwork, value_type& update, const bool final_pass) const { - const value_type n = iwork + 1; - const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0; + const value_type n = iwork + 1; + const value_type imbalance = + ((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz)) + ? ImbalanceSz + : value_type(0); // Insert an artificial load imbalance @@ -133,12 +136,29 @@ struct TestScan { } } }; +} // namespace TEST(TEST_CATEGORY, scan) { - TestScan::test_range(1, 1000); - TestScan(0); - TestScan(100000); - TestScan(10000000); - TEST_EXECSPACE().fence(); + constexpr auto imbalance_size = 1000; + TestScan::test_range(1, 1000); + TestScan(0); + TestScan(100000); + TestScan(10000000); +} + +TEST(TEST_CATEGORY, small_size_scan) { + constexpr auto imbalance_size = 10; // Pick to not overflow... + TestScan(0); + TestScan(5); + TestScan(10); + TestScan( + static_cast( + std::sqrt(std::numeric_limits::max()))); + constexpr auto short_imbalance_size = 100; // Pick to not overflow... 
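The checks above exercise the create_mirror* overloads that accept a view_alloc property list carrying a memory space, which the Kokkos_CopyViews.hpp changes now dispatch on. A short usage sketch with illustrative view names; the calls themselves mirror the ones tested above:

#include <Kokkos_Core.hpp>

void mirror_examples() {
  Kokkos::View<double*> d_data("d_data", 100);  // default (device) memory space

  // Host mirror, allocated but left uninitialized.
  auto h_data = Kokkos::create_mirror_view(
      Kokkos::view_alloc(Kokkos::WithoutInitializing), d_data);
  Kokkos::deep_copy(h_data, d_data);  // fill the host mirror explicitly

  // Mirror requested in an explicit memory space; when d_data already lives
  // there, the original view is returned instead of a fresh allocation.
  auto d_mirror = Kokkos::create_mirror_view(
      Kokkos::view_alloc(Kokkos::DefaultExecutionSpace::memory_space{}), d_data);

  // Mirror plus deep_copy of the contents in one call.
  auto h_copy = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, d_data);
}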
+  TestScan(0);
+  TestScan(5);
+  TestScan(100);
+  TestScan(
+      static_cast(
+          std::sqrt(std::numeric_limits::max())));
 }
-}  // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp
index f1d0f9cb3b..3f05b2ef66 100644
--- a/lib/kokkos/core/unit_test/TestTeam.hpp
+++ b/lib/kokkos/core/unit_test/TestTeam.hpp
@@ -1616,6 +1616,73 @@ struct TestTeamPolicyHandleByValue {
 }  // namespace
 
+namespace {
+template
+struct TestRepeatedTeamReduce {
+  static constexpr int ncol = 1500;  // nothing special, just some work
+
+  KOKKOS_FUNCTION void operator()(
+      const typename Kokkos::TeamPolicy::member_type &team)
+      const {
+    // non-divisible by power of two to make triggering problems easier
+    constexpr int nlev = 129;
+    constexpr auto pi = Kokkos::Experimental::pi_v;
+    double b = 0.;
+    for (int ri = 0; ri < 10; ++ri) {
+      // The contributions here must be sufficiently complex, simply adding ones
+      // wasn't enough to trigger the bug.
+      const auto g1 = [&](const int k, double &acc) {
+        acc += Kokkos::cos(pi * double(k) / nlev);
+      };
+      const auto g2 = [&](const int k, double &acc) {
+        acc += Kokkos::sin(pi * double(k) / nlev);
+      };
+      double a1, a2;
+      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
+      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
+      b += a1;
+      b += a2;
+    }
+    const auto h = [&]() {
+      const auto col = team.league_rank();
+      v(col) = b + col;
+    };
+    Kokkos::single(Kokkos::PerTeam(team), h);
+  }
+
+  KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
+    if (v(i) != v(0) + i) {
+      ++bad;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i);
+    }
+  }
+
+  TestRepeatedTeamReduce() : v("v", ncol) { test(); }
+
+  void test() {
+    int team_size_recommended =
+        Kokkos::TeamPolicy(1, 1).team_size_recommended(
+            *this, Kokkos::ParallelForTag());
+    // Choose a non-recommended (non-power of two for GPUs) team size
+    int team_size = team_size_recommended > 1 ?
team_size_recommended - 1 : 1; + + // The failure was non-deterministic so run the test a bunch of times + for (int it = 0; it < 100; ++it) { + Kokkos::parallel_for( + Kokkos::TeamPolicy(ncol, team_size, 1), *this); + + int bad = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, ncol), + *this, bad); + ASSERT_EQ(bad, 0) << " Failing in iteration " << it; + } + } + + Kokkos::View v; +}; + +} // namespace + } // namespace Test /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/unit_test/TestTeamReductionScan.hpp b/lib/kokkos/core/unit_test/TestTeamReductionScan.hpp index 469bba23b7..4d4f3b1f4d 100644 --- a/lib/kokkos/core/unit_test/TestTeamReductionScan.hpp +++ b/lib/kokkos/core/unit_test/TestTeamReductionScan.hpp @@ -134,5 +134,15 @@ TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) { } } +TEST(TEST_CATEGORY, repeated_team_reduce) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same::value) + GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not " + "properly implemented"; +#endif + + TestRepeatedTeamReduce(); +} + } // namespace Test #endif diff --git a/lib/kokkos/core/unit_test/TestViewIsAssignable.hpp b/lib/kokkos/core/unit_test/TestViewIsAssignable.hpp index 03c3b977ed..3ac392d3e9 100644 --- a/lib/kokkos/core/unit_test/TestViewIsAssignable.hpp +++ b/lib/kokkos/core/unit_test/TestViewIsAssignable.hpp @@ -92,8 +92,18 @@ TEST(TEST_CATEGORY, view_is_assignable) { View>::test(false, false, 10); // Layout assignment + Impl::TestAssignability, + View>::test(true, true); Impl::TestAssignability, View>::test(true, true, 10); + Impl::TestAssignability, + View>::test(false, false, 10); + Impl::TestAssignability, + View>::test(false, true, 10); + Impl::TestAssignability, + View>::test(true, true); + Impl::TestAssignability, + View>::test(false, false); // This could be made possible (due to the degenerate nature of the views) but // we do not allow this yet diff --git a/lib/kokkos/kokkos_5538.diff b/lib/kokkos/kokkos_5538.diff new file mode 100644 index 0000000000..6bf2ccf6a4 --- /dev/null +++ b/lib/kokkos/kokkos_5538.diff @@ -0,0 +1,199 @@ +diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos +index 22af411f32..530510a0d1 100644 +--- a/lib/kokkos/Makefile.kokkos ++++ b/lib/kokkos/Makefile.kokkos +@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP" + #KOKKOS_DEVICES ?= "Threads" + # Options: + # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR +-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 ++# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90 + # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX + # IBM: BGQ,Power7,Power8,Power9 + # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A +@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt + KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) + KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) + KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) ++KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90) + KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + + 
$(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
+@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+   + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+   + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+   + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
+-  + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
++  + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
++  + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
+ 
+ #SEK: This seems like a bug to me
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
+@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
+     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
+     KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
+   endif
++  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
++    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
++    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
++    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
++  endif
+ 
+   ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
+     KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
+diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
+index 88ddc48378..b83ced9243 100644
+--- a/lib/kokkos/cmake/KokkosCore_config.h.in
++++ b/lib/kokkos/cmake/KokkosCore_config.h.in
+@@ -102,6 +102,7 @@
+ #cmakedefine KOKKOS_ARCH_AMPERE
+ #cmakedefine KOKKOS_ARCH_AMPERE80
+ #cmakedefine KOKKOS_ARCH_AMPERE86
++#cmakedefine KOKKOS_ARCH_HOPPER90
+ #cmakedefine KOKKOS_ARCH_AMD_ZEN
+ #cmakedefine KOKKOS_ARCH_AMD_ZEN2
+ #cmakedefine KOKKOS_ARCH_AMD_ZEN3
+diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+index f56cef1651..2585a6a64c 100644
+--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
++++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+@@ -74,6 +74,7 @@ int main() {
+   case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
+   case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
+   case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
++  case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." 
<< std::endl; break; + default: + std::cout << "Compute capability " << compute_capability + << " is not supported" << std::endl; +diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake +index ef16aad047..c1d76cceeb 100644 +--- a/lib/kokkos/cmake/kokkos_arch.cmake ++++ b/lib/kokkos/cmake/kokkos_arch.cmake +@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK + KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") + KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") + KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") ++KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") + + IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS) + SET(KOKKOS_SHOW_HIP_ARCHS ON) +@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72) + CHECK_CUDA_ARCH(TURING75 sm_75) + CHECK_CUDA_ARCH(AMPERE80 sm_80) + CHECK_CUDA_ARCH(AMPERE86 sm_86) ++CHECK_CUDA_ARCH(HOPPER90 sm_90) + + SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") + FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) +@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) + SET(KOKKOS_ARCH_AMPERE ON) + ENDIF() + ++IF (KOKKOS_ARCH_HOPPER90) ++ SET(KOKKOS_ARCH_HOPPER ON) ++ENDIF() ++ + #Regardless of version, make sure we define the general architecture name + IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A) + SET(KOKKOS_ARCH_VEGA ON) +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +index 56f9117844..fcd4773dbc 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { + case 61: return 96; + case 70: + case 80: +- case 86: return 8; ++ case 86: ++ case 90: return 8; + case 75: return 32; + default: + Kokkos::Impl::throw_runtime_exception( +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp +index 40a263561f..8c40ebd60d 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp +@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION + #endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010 + + #if CUDA_VERSION >= 11010 && \ +- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86))) ++ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER))) + KOKKOS_INLINE_FUNCTION + bhalf_t cast_to_bhalf(bhalf_t val) { return val; } + KOKKOS_INLINE_FUNCTION +diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +index f9451ecfe6..2ce1efb98c 100644 +--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp ++++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl { + + struct OpenACC_Traits { + #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ +- defined(KOKKOS_ARCH_AMPERE) ++ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER) + static constexpr acc_device_t dev_type = acc_device_nvidia; + static constexpr bool may_fallback_to_host = false; + #else +diff --git 
a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +index a9bc085912..27ee1d4232 100644 +--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp ++++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() { + + // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures + // from Pascal and upwards. +-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ +- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) ++#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ ++ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ ++ defined(KOKKOS_ARCH_HOPPER) + #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) + omp_set_num_teams(512); + #endif +diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +index 840db4327c..7e5addbc5b 100644 +--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp ++++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { + #if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \ + !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \ + !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \ +- !defined(KOKKOS_ARCH_AMPERE) ++ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER) + if (!settings.has_device_id() && gpu_devices.empty()) { + Impl::SYCLInternal::singleton().initialize(sycl::device()); + return; +diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +index 5ac7d8af30..ba101f699e 100644 +--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp ++++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp +@@ -335,9 +335,10 @@ class TeamPolicyInternal + return std::min({ + int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), + // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. +-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ +- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ +- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) ++#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ ++ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ ++ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ ++ defined(KOKKOS_ARCH_HOPPER) + 256, + #endif + max_threads_for_memory +@@ -367,9 +368,10 @@ class TeamPolicyInternal + return std::min({ + int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize), + // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. 
+-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ +- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ +- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) ++#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ ++ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ ++ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \ ++ defined(KOKKOS_ARCH_HOPPER) + 256, + #endif + max_threads_for_memory diff --git a/lib/kokkos/kokkos_5706.diff b/lib/kokkos/kokkos_5706.diff new file mode 100644 index 0000000000..2bfbb35b06 --- /dev/null +++ b/lib/kokkos/kokkos_5706.diff @@ -0,0 +1,523 @@ +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +index fcd4773dbc..30b6958a67 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, + LaunchBounds{}); + } + +-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1) + // NOTE these number can be obtained several ways: + // * One option is to download the CUDA Occupancy Calculator spreadsheet, select + // "Compute Capability" first and check what is the smallest "Shared Memory +@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { + return 0; + }() * 1024; + } ++ + } // namespace Impl + } // namespace Kokkos + +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +index 5811498e01..e22eb3b842 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default + } + #endif + +-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API +- cudaThreadSetCacheConfig(cudaFuncCachePreferShared); +-#else +- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); +-#endif +- + // Init the array for used for arbitrarily sized atomics + if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays(); + +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +index b7a80ad84f..5c4c3a7d39 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +@@ -93,10 +93,6 @@ namespace Impl { + // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) + // function qualifier which could be used to improve performance. + //---------------------------------------------------------------------------- +-// Maximize L1 cache and minimize shared memory: +-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 ); +-// For 2.0 capability: 48 KB L1 and 16 KB shared +-//---------------------------------------------------------------------------- + + template + __global__ static void cuda_parallel_launch_constant_memory() { +@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { + } + } + +-// This function needs to be template on DriverType and LaunchBounds ++// These functions needs to be template on DriverType and LaunchBounds + // so that the static bool is unique for each type combo + // KernelFuncPtr does not necessarily contain that type information. 
++ + template +-inline void configure_shmem_preference(KernelFuncPtr const& func, +- bool prefer_shmem) { ++const cudaFuncAttributes& get_cuda_kernel_func_attributes( ++ const KernelFuncPtr& func) { ++ // Only call cudaFuncGetAttributes once for each unique kernel ++ // by leveraging static variable initialization rules ++ auto wrap_get_attributes = [&]() -> cudaFuncAttributes { ++ cudaFuncAttributes attr; ++ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); ++ return attr; ++ }; ++ static cudaFuncAttributes func_attr = wrap_get_attributes(); ++ return func_attr; ++} ++ ++template ++inline void configure_shmem_preference(const KernelFuncPtr& func, ++ const cudaDeviceProp& device_props, ++ const size_t block_size, int& shmem, ++ const size_t occupancy) { + #ifndef KOKKOS_ARCH_KEPLER +- // On Kepler the L1 has no benefit since it doesn't cache reads ++ ++ const auto& func_attr = ++ get_cuda_kernel_func_attributes(func); ++ ++ // Compute limits for number of blocks due to registers/SM ++ const size_t regs_per_sm = device_props.regsPerMultiprocessor; ++ const size_t regs_per_thread = func_attr.numRegs; ++ // The granularity of register allocation is chunks of 256 registers per warp ++ // -> 8 registers per thread ++ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); ++ const size_t max_blocks_regs = ++ regs_per_sm / (allocated_regs_per_thread * block_size); ++ ++ // Compute how many threads per sm we actually want ++ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; ++ // only allocate multiples of warp size ++ const size_t num_threads_desired = ++ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32; ++ // Get close to the desired occupancy, ++ // don't undershoot by much but also don't allocate a whole new block just ++ // because one is a few threads over otherwise. ++ size_t num_blocks_desired = ++ (num_threads_desired + block_size * 0.8) / block_size; ++ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired); ++ if (num_blocks_desired == 0) num_blocks_desired = 1; ++ ++ // Calculate how much shared memory we need per block ++ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes; ++ ++ // The minimum shared memory allocation we can have in total per SM is 8kB. ++ // If we want to lower occupancy we have to make sure we request at least that ++ // much in aggregate over all blocks, so that shared memory actually becomes a ++ // limiting factor for occupancy ++ constexpr size_t min_shmem_size_per_sm = 8192; ++ if ((occupancy < 100) && ++ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) { ++ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired; ++ // Need to set the caller's shmem variable so that the ++ // kernel launch uses the correct dynamic shared memory request ++ shmem = shmem_per_block - func_attr.sharedSizeBytes; ++ } ++ ++ // Compute the carveout fraction we need based on occupancy ++ // Use multiples of 8kB ++ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; ++ size_t carveout = shmem_per_block == 0 ++ ? 0 ++ : 100 * ++ (((num_blocks_desired * shmem_per_block + ++ min_shmem_size_per_sm - 1) / ++ min_shmem_size_per_sm) * ++ min_shmem_size_per_sm) / ++ max_shmem_per_sm; ++ if (carveout > 100) carveout = 100; ++ ++ // Set the carveout, but only call it once per kernel or when it changes + auto set_cache_config = [&] { +- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig( +- func, +- (prefer_shmem ? 
cudaFuncCachePreferShared : cudaFuncCachePreferL1))); +- return prefer_shmem; ++ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute( ++ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout)); ++ return carveout; + }; +- static bool cache_config_preference_cached = set_cache_config(); +- if (cache_config_preference_cached != prefer_shmem) { ++ // Store the value in a static variable so we only reset if needed ++ static size_t cache_config_preference_cached = set_cache_config(); ++ if (cache_config_preference_cached != carveout) { + cache_config_preference_cached = set_cache_config(); + } + #else + // Use the parameters so we don't get a warning + (void)func; +- (void)prefer_shmem; ++ (void)device_props; ++ (void)block_size; ++ (void)occupancy; + #endif + } + +-template +-std::enable_if_t +-modify_launch_configuration_if_desired_occupancy_is_specified( +- Policy const& policy, cudaDeviceProp const& properties, +- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem, +- bool& prefer_shmem) { +- int const block_size = block.x * block.y * block.z; +- int const desired_occupancy = policy.impl_get_desired_occupancy().value(); +- +- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties); +- size_t const static_shmem = attributes.sharedSizeBytes; +- +- // round to nearest integer and avoid division by zero +- int active_blocks = std::max( +- 1, static_cast(std::round( +- static_cast(properties.maxThreadsPerMultiProcessor) / +- block_size * desired_occupancy / 100))); +- int const dynamic_shmem = +- shmem_per_sm_prefer_l1 / active_blocks - static_shmem; +- +- if (dynamic_shmem > shmem) { +- shmem = dynamic_shmem; +- prefer_shmem = false; +- } +-} +- +-template +-std::enable_if_t +-modify_launch_configuration_if_desired_occupancy_is_specified( +- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&, +- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {} +- + // end Some helper functions for launch code readability }}}1 + //============================================================================== + +@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker< + #ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, +- CudaInternal const* cuda_instance, bool prefer_shmem) { ++ CudaInternal const* cuda_instance) { + //---------------------------------------- + auto const& graph = Impl::get_cuda_graph_from_kernel(driver); + KOKKOS_EXPECTS(bool(graph)); +@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker< + + if (!Impl::is_empty_launch(grid, block)) { + Impl::check_shmem_request(cuda_instance, shmem); +- Impl::configure_shmem_preference( +- base_t::get_kernel_func(), prefer_shmem); ++ if (DriverType::Policy:: ++ experimental_contains_desired_occupancy) { ++ int desired_occupancy = ++ driver.get_policy().impl_get_desired_occupancy().value(); ++ size_t block_size = block.x * block.y * block.z; ++ Impl::configure_shmem_preference( ++ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, ++ shmem, desired_occupancy); ++ } + + void const* args[] = {&driver}; + +@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker< + #ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, +- CudaInternal const* cuda_instance, bool prefer_shmem) { ++ CudaInternal const* cuda_instance) { + 
//---------------------------------------- + auto const& graph = Impl::get_cuda_graph_from_kernel(driver); + KOKKOS_EXPECTS(bool(graph)); +@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker< + + if (!Impl::is_empty_launch(grid, block)) { + Impl::check_shmem_request(cuda_instance, shmem); +- Impl::configure_shmem_preference( +- base_t::get_kernel_func(), prefer_shmem); ++ if constexpr (DriverType::Policy:: ++ experimental_contains_desired_occupancy) { ++ int desired_occupancy = ++ driver.get_policy().impl_get_desired_occupancy().value(); ++ size_t block_size = block.x * block.y * block.z; ++ Impl::configure_shmem_preference( ++ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, ++ shmem, desired_occupancy); ++ } + + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + +@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker< + #ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, +- CudaInternal const* cuda_instance, bool prefer_shmem) { ++ CudaInternal const* cuda_instance) { + // Just use global memory; coordinating through events to share constant + // memory with the non-graph interface is not really reasonable since + // events don't work with Graphs directly, and this would anyway require +@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::GlobalMemory>; + global_launch_impl_t::create_parallel_launch_graph_node( +- driver, grid, block, shmem, cuda_instance, prefer_shmem); ++ driver, grid, block, shmem, cuda_instance); + } + #endif + }; +@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl< + + inline static void launch_kernel(const DriverType& driver, const dim3& grid, + const dim3& block, int shmem, +- const CudaInternal* cuda_instance, +- bool prefer_shmem) { ++ const CudaInternal* cuda_instance) { + if (!Impl::is_empty_launch(grid, block)) { + // Prevent multiple threads to simultaneously set the cache configuration + // preference and launch the same kernel +@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl< + + Impl::check_shmem_request(cuda_instance, shmem); + +- // If a desired occupancy is specified, we compute how much shared memory +- // to ask for to achieve that occupancy, assuming that the cache +- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic +- // shared memory computed is actually smaller than `shmem` we overwrite +- // `shmem` and set `prefer_shmem` to `false`. 
+- modify_launch_configuration_if_desired_occupancy_is_specified( +- driver.get_policy(), cuda_instance->m_deviceProp, +- get_cuda_func_attributes(), block, shmem, prefer_shmem); +- +- Impl::configure_shmem_preference< +- DriverType, Kokkos::LaunchBounds>( +- base_t::get_kernel_func(), prefer_shmem); ++ if (DriverType::Policy:: ++ experimental_contains_desired_occupancy) { ++ int desired_occupancy = ++ driver.get_policy().impl_get_desired_occupancy().value(); ++ size_t block_size = block.x * block.y * block.z; ++ Impl::configure_shmem_preference< ++ DriverType, ++ Kokkos::LaunchBounds>( ++ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, ++ shmem, desired_occupancy); ++ } + + KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); + +@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl< + } + + static cudaFuncAttributes get_cuda_func_attributes() { +- // Race condition inside of cudaFuncGetAttributes if the same address is +- // given requires using a local variable as input instead of a static Rely +- // on static variable initialization to make sure only one thread executes +- // the code and the result is visible. +- auto wrap_get_attributes = []() -> cudaFuncAttributes { +- cudaFuncAttributes attr_tmp; +- KOKKOS_IMPL_CUDA_SAFE_CALL( +- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func())); +- return attr_tmp; +- }; +- static cudaFuncAttributes attr = wrap_get_attributes(); +- return attr; ++ return get_cuda_kernel_func_attributes< ++ DriverType, Kokkos::LaunchBounds>( ++ base_t::get_kernel_func()); + } + }; + +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +index e586bb4cc6..0e348c092a 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +@@ -121,8 +121,7 @@ class ParallelFor, Kokkos::Cuda> { + maxblocks[1]), + 1); + CudaParallelLaunch( +- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), +- false); ++ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); + } else if (RP::rank == 3) { + const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); + KOKKOS_ASSERT(block.x > 0); +@@ -139,8 +138,7 @@ class ParallelFor, Kokkos::Cuda> { + (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, + maxblocks[2])); + CudaParallelLaunch( +- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), +- false); ++ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); + } else if (RP::rank == 4) { + // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to + // threadIdx.z +@@ -158,8 +156,7 @@ class ParallelFor, Kokkos::Cuda> { + (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, + maxblocks[2])); + CudaParallelLaunch( +- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), +- false); ++ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); + } else if (RP::rank == 5) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to + // threadIdx.z +@@ -175,8 +172,7 @@ class ParallelFor, Kokkos::Cuda> { + (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, + maxblocks[2])); + CudaParallelLaunch( +- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), +- false); ++ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); + } else if (RP::rank == 6) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to + // threadIdx.z +@@ -191,8 +187,7 @@ 
class ParallelFor, Kokkos::Cuda> { + std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], + maxblocks[2])); + CudaParallelLaunch( +- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(), +- false); ++ *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); + } else { + Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); + } +@@ -405,8 +400,8 @@ class ParallelReduce, ReducerType, + + CudaParallelLaunch( + *this, grid, block, shmem, +- m_policy.space().impl_internal_space_instance(), +- false); // copy to device and execute ++ m_policy.space() ++ .impl_internal_space_instance()); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + if (m_result_ptr) { +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +index ac160f8fe2..d1031751c2 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +@@ -135,8 +135,7 @@ class ParallelFor, Kokkos::Cuda> { + #endif + + CudaParallelLaunch( +- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), +- false); ++ *this, grid, block, 0, m_policy.space().impl_internal_space_instance()); + } + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) +@@ -375,8 +374,8 @@ class ParallelReduce, ReducerType, + + CudaParallelLaunch( + *this, grid, block, shmem, +- m_policy.space().impl_internal_space_instance(), +- false); // copy to device and execute ++ m_policy.space() ++ .impl_internal_space_instance()); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + if (m_result_ptr) { +@@ -726,16 +725,16 @@ class ParallelScan, Kokkos::Cuda> { + m_final = false; + CudaParallelLaunch( + *this, grid, block, shmem, +- m_policy.space().impl_internal_space_instance(), +- false); // copy to device and execute ++ m_policy.space() ++ .impl_internal_space_instance()); // copy to device and execute + #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + } + #endif + m_final = true; + CudaParallelLaunch( + *this, grid, block, shmem, +- m_policy.space().impl_internal_space_instance(), +- false); // copy to device and execute ++ m_policy.space() ++ .impl_internal_space_instance()); // copy to device and execute + } + } + +@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal, + m_final = false; + CudaParallelLaunch( + *this, grid, block, shmem, +- m_policy.space().impl_internal_space_instance(), +- false); // copy to device and execute ++ m_policy.space() ++ .impl_internal_space_instance()); // copy to device and execute + #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + } + #endif + m_final = true; + CudaParallelLaunch( + *this, grid, block, shmem, +- m_policy.space().impl_internal_space_instance(), +- false); // copy to device and execute ++ m_policy.space() ++ .impl_internal_space_instance()); // copy to device and execute + + const int size = Analysis::value_size(m_functor); + #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +index cdd16085b3..ea9430b812 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +@@ -552,8 +552,8 @@ class ParallelFor, + + CudaParallelLaunch( + *this, grid, block, shmem_size_total, +- m_policy.space().impl_internal_space_instance(), +- true); // copy to device and execute ++ m_policy.space() ++ 
.impl_internal_space_instance()); // copy to device and execute + } + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) +@@ -878,8 +878,8 @@ class ParallelReduce, + + CudaParallelLaunch( + *this, grid, block, shmem_size_total, +- m_policy.space().impl_internal_space_instance(), +- true); // copy to device and execute ++ m_policy.space() ++ .impl_internal_space_instance()); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().fence( +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +index 34d4bef9fd..178012431c 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +@@ -428,11 +428,6 @@ struct CudaReductionsFunctor { + // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) + // function qualifier which could be used to improve performance. + //---------------------------------------------------------------------------- +-// Maximize shared memory and minimize L1 cache: +-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared ); +-// For 2.0 capability: 48 KB shared and 16 KB L1 +-//---------------------------------------------------------------------------- +-//---------------------------------------------------------------------------- + /* + * Algorithmic constraints: + * (a) blockDim.y <= 1024 +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +index fb3a6b138f..a12378a891 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +@@ -100,8 +100,7 @@ class ParallelFor, + const int shared = 0; + + Kokkos::Impl::CudaParallelLaunch( +- *this, grid, block, shared, Cuda().impl_internal_space_instance(), +- false); ++ *this, grid, block, shared, Cuda().impl_internal_space_instance()); + } + + inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/lib/kokkos/kokkos_5731.diff b/lib/kokkos/kokkos_5731.diff new file mode 100644 index 0000000000..e95f4a1546 --- /dev/null +++ b/lib/kokkos/kokkos_5731.diff @@ -0,0 +1,46 @@ +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +index 30b6958a67..b94f053272 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, + LaunchBounds{}); + } + +-// NOTE these number can be obtained several ways: +-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select +-// "Compute Capability" first and check what is the smallest "Shared Memory +-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in +-// bytes is then to be found below in the summary. 
+-// * Another option would be to look for the information in the "Tuning
+-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
+-// the "Shared Memory" section (more tedious)
+-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
+-  int const compute_capability = properties.major * 10 + properties.minor;
+-  return [compute_capability]() {
+-    switch (compute_capability) {
+-      case 30:
+-      case 32:
+-      case 35: return 16;
+-      case 37: return 80;
+-      case 50:
+-      case 53:
+-      case 60:
+-      case 62: return 64;
+-      case 52:
+-      case 61: return 96;
+-      case 70:
+-      case 80:
+-      case 86:
+-      case 90: return 8;
+-      case 75: return 32;
+-      default:
+-        Kokkos::Impl::throw_runtime_exception(
+-            "Unknown device in cuda block size deduction");
+-    }
+-    return 0;
+-  }() * 1024;
+-}
+-
+ }  // namespace Impl
+ }  // namespace Kokkos
+ 
diff --git a/lib/kokkos/kokkos_5739.diff b/lib/kokkos/kokkos_5739.diff
new file mode 100644
index 0000000000..fe7a1ff551
--- /dev/null
+++ b/lib/kokkos/kokkos_5739.diff
@@ -0,0 +1,204 @@
+diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+index b94f053272..252c13c524 100644
+--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+@@ -53,17 +53,69 @@
+ namespace Kokkos {
+ namespace Impl {
+ 
++inline int cuda_warp_per_sm_allocation_granularity(
++    cudaDeviceProp const& properties) {
++  // Allocation granularity of warps in each sm
++  switch (properties.major) {
++    case 3:
++    case 5:
++    case 7:
++    case 8:
++    case 9: return 4;
++    case 6: return (properties.minor == 0 ? 2 : 4);
++    default:
++      throw_runtime_exception(
++          "Unknown device in cuda warp per sm allocation granularity");
++      return 0;
++  }
++}
++
++inline int cuda_max_warps_per_sm_registers(
++    cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
++  // Maximum number of warps per sm as a function of register counts,
++  // subject to the constraint that warps are allocated with a fixed granularity
++  int const max_regs_per_block = properties.regsPerBlock;
++  int const regs_per_warp = attributes.numRegs * properties.warpSize;
++  int const warp_granularity =
++      cuda_warp_per_sm_allocation_granularity(properties);
++  // The granularity of register allocation is chunks of 256 registers per warp,
++  // which implies a need to over-allocate, so we round up
++  int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
++
++  // The maximum number of warps per SM is constrained from above by register
++  // allocation. To satisfy the constraint that warps per SM is allocated at a
++  // finite granularity, we need to round down. 
++ int const max_warps_per_sm = ++ warp_granularity * ++ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity)); ++ ++ return max_warps_per_sm; ++} ++ + inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, + cudaFuncAttributes const& attributes, + int block_size, size_t dynamic_shmem) { +- // Limits due do registers/SM ++ // Limits due to registers/SM + int const regs_per_sm = properties.regsPerMultiprocessor; + int const regs_per_thread = attributes.numRegs; + // The granularity of register allocation is chunks of 256 registers per warp + // -> 8 registers per thread + int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); +- int const max_blocks_regs = +- regs_per_sm / (allocated_regs_per_thread * block_size); ++ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size); ++ ++ // Compute the maximum number of warps as a function of the number of ++ // registers ++ int const max_warps_per_sm_registers = ++ cuda_max_warps_per_sm_registers(properties, attributes); ++ ++ // Constrain the number of blocks to respect the maximum number of warps per ++ // SM On face value this should be an equality, but due to the warp ++ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the ++ // left-hand-side of this comparison can overshoot what the hardware allows ++ // based on register counts alone ++ while ((max_blocks_regs * block_size / properties.warpSize) > ++ max_warps_per_sm_registers) ++ max_blocks_regs--; + + // Limits due to shared memory/SM + size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor; +@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance, + LaunchBounds{}); + } + ++template ++int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr, ++ LaunchBounds) { ++ auto const& prop = Kokkos::Cuda().cuda_device_prop(); ++ ++ // Thin version of cuda_get_opt_block_size for cases where there is no shared ++ // memory ++ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; }; ++ ++ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem, ++ LaunchBounds{}); ++} ++ + } // namespace Impl + } // namespace Kokkos + +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +index 5c4c3a7d39..170183ca0a 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, + // The granularity of register allocation is chunks of 256 registers per warp + // -> 8 registers per thread + const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8); +- const size_t max_blocks_regs = ++ size_t max_blocks_regs = + regs_per_sm / (allocated_regs_per_thread * block_size); + ++ // Compute the maximum number of warps as a function of the number of ++ // registers ++ const size_t max_warps_per_sm_registers = ++ cuda_max_warps_per_sm_registers(device_props, func_attr); ++ ++ // Constrain the number of blocks to respect the maximum number of warps per ++ // SM On face value this should be an equality, but due to the warp ++ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the ++ // left-hand-side of this comparison can overshoot what the hardware allows ++ // based on register counts alone ++ while ((max_blocks_regs * block_size / device_props.warpSize) > ++ max_warps_per_sm_registers) ++ 
max_blocks_regs--; ++ + // Compute how many threads per sm we actually want + const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor; + // only allocate multiples of warp size +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +index 0e348c092a..7e4f62f12e 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +@@ -67,6 +67,34 @@ + namespace Kokkos { + namespace Impl { + ++template ++int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { ++ cudaFuncAttributes attr = ++ CudaParallelLaunch::get_cuda_func_attributes(); ++ auto const& prop = pol.space().cuda_device_prop(); ++ ++ // Limits due to registers/SM, MDRange doesn't have ++ // shared memory constraints ++ int const optimal_block_size = ++ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{}); ++ ++ // Compute how many blocks of this size we can launch, based on warp ++ // constraints ++ int const max_warps_per_sm_registers = ++ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr); ++ int const max_num_threads_from_warps = ++ max_warps_per_sm_registers * prop.warpSize; ++ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size; ++ ++ // Compute the total number of threads ++ int const max_threads_per_sm = optimal_block_size * max_num_blocks; ++ ++ return std::min( ++ max_threads_per_sm, ++ static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); ++} ++ + template + class ParallelFor, Kokkos::Cuda> { + public: +@@ -85,18 +113,7 @@ class ParallelFor, Kokkos::Cuda> { + public: + template + static int max_tile_size_product(const Policy& pol, const Functor&) { +- cudaFuncAttributes attr = +- CudaParallelLaunch::get_cuda_func_attributes(); +- auto const& prop = pol.space().cuda_device_prop(); +- // Limits due to registers/SM, MDRange doesn't have +- // shared memory constraints +- int const regs_per_sm = prop.regsPerMultiprocessor; +- int const regs_per_thread = attr.numRegs; +- int const max_threads_per_sm = regs_per_sm / regs_per_thread; +- return std::min( +- max_threads_per_sm, +- static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); ++ return max_tile_size_product_helper(pol, LaunchBounds{}); + } + Policy const& get_policy() const { return m_rp; } + inline __device__ void operator()() const { +@@ -258,17 +275,7 @@ class ParallelReduce, ReducerType, + public: + template + static int max_tile_size_product(const Policy& pol, const Functor&) { +- cudaFuncAttributes attr = +- CudaParallelLaunch::get_cuda_func_attributes(); +- auto const& prop = pol.space().cuda_device_prop(); +- // Limits due do registers/SM +- int const regs_per_sm = prop.regsPerMultiprocessor; +- int const regs_per_thread = attr.numRegs; +- int const max_threads_per_sm = regs_per_sm / regs_per_thread; +- return std::min( +- max_threads_per_sm, +- static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); ++ return max_tile_size_product_helper(pol, LaunchBounds{}); + } + Policy const& get_policy() const { return m_policy; } + inline __device__ void exec_range(reference_type update) const { diff --git a/lib/kokkos/kokkos_fix_5706_apply_last.diff b/lib/kokkos/kokkos_fix_5706_apply_last.diff new file mode 100644 index 0000000000..5d298323fd --- /dev/null +++ b/lib/kokkos/kokkos_fix_5706_apply_last.diff @@ -0,0 +1,63 @@ +diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +index 170183ca0a..ba43e362bb 100644 +--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp ++++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker< + Impl::check_shmem_request(cuda_instance, shmem); + if (DriverType::Policy:: + experimental_contains_desired_occupancy) { ++ /* + int desired_occupancy = + driver.get_policy().impl_get_desired_occupancy().value(); + size_t block_size = block.x * block.y * block.z; + Impl::configure_shmem_preference( + base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, +- shmem, desired_occupancy); ++ shmem, desired_occupancy);*/ ++ Kokkos::Impl::throw_runtime_exception( ++ std::string("Cuda graph node creation FAILED:" ++ " occupancy requests are currently broken.")); + } + + void const* args[] = {&driver}; +@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker< + + if (!Impl::is_empty_launch(grid, block)) { + Impl::check_shmem_request(cuda_instance, shmem); +- if constexpr (DriverType::Policy:: ++ if (DriverType::Policy:: + experimental_contains_desired_occupancy) { +- int desired_occupancy = ++ /*int desired_occupancy = + driver.get_policy().impl_get_desired_occupancy().value(); + size_t block_size = block.x * block.y * block.z; + Impl::configure_shmem_preference( + base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, +- shmem, desired_occupancy); ++ shmem, desired_occupancy);*/ ++ Kokkos::Impl::throw_runtime_exception( ++ std::string("Cuda graph node creation FAILED:" ++ " occupancy requests are currently broken.")); + } + + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); +@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl< + + if (DriverType::Policy:: + experimental_contains_desired_occupancy) { +- int desired_occupancy = ++ /*int desired_occupancy = + driver.get_policy().impl_get_desired_occupancy().value(); + size_t block_size = block.x * block.y * block.z; + Impl::configure_shmem_preference< + DriverType, + Kokkos::LaunchBounds>( + base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, +- shmem, desired_occupancy); ++ shmem, desired_occupancy);*/ ++ Kokkos::Impl::throw_runtime_exception( ++ std::string("Cuda graph node creation FAILED:" ++ " occupancy requests are currently broken.")); + } + + KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); diff --git a/lib/kokkos/master_history.txt b/lib/kokkos/master_history.txt index a1a87ce319..bd639c847e 100644 --- a/lib/kokkos/master_history.txt +++ b/lib/kokkos/master_history.txt @@ -29,3 +29,4 @@ tag: 3.5.00 date: 11:19:2021 master: c28a8b03 release: 21b879e4 tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404 tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb +tag: 3.7.01 date: 12:01:2022 master: 61d7db55 release: d3bb8cfe diff --git a/lib/kokkos/simd/cmake/Dependencies.cmake b/lib/kokkos/simd/cmake/Dependencies.cmake index 5e29157369..1d71d8af34 100644 --- a/lib/kokkos/simd/cmake/Dependencies.cmake +++ b/lib/kokkos/simd/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC TEST_OPTIONAL_TPLS CUSPARSE ) diff --git a/lib/kokkos/tpls/.clang-format b/lib/kokkos/tpls/.clang-format new file mode 100644 index 0000000000..743216e523 --- /dev/null +++ b/lib/kokkos/tpls/.clang-format @@ -0,0 +1,3 @@ 
+#Official Tool: clang-format version 8.0.0 +DisableFormat: true +SortIncludes: false diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp index 2166fa3cb7..1815adb4a7 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp @@ -76,7 +76,7 @@ namespace Impl { /// instances in other translation units, we must update this CUDA global /// variable based on the Host global variable prior to running any kernels /// that will use it. -/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. +/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro. __device__ #ifdef __CUDACC_RDC__ __constant__ extern @@ -138,42 +138,33 @@ namespace { static int lock_array_copied = 0; inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace - -#ifdef __CUDACC_RDC__ -inline -#else -static -#endif - void - copy_cuda_lock_arrays_to_device() { - if (lock_array_copied == 0) { - cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE, - &CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, - sizeof(int32_t*)); - cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE, - &CUDA_SPACE_ATOMIC_LOCKS_NODE_h, - sizeof(int32_t*)); - } - lock_array_copied = 1; -} - } // namespace Impl } // namespace desul +/* It is critical that this code be a macro, so that it will + capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE + putting this in an inline function will NOT do the right thing! */ +#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ + { \ + if (::desul::Impl::lock_array_copied == 0) { \ + cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE, \ + &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \ + sizeof(int32_t*)); \ + cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE, \ + &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \ + sizeof(int32_t*)); \ + } \ + ::desul::Impl::lock_array_copied = 1; \ + } #endif /* defined( __CUDACC__ ) */ #endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */ -namespace desul { - #if defined(__CUDACC_RDC__) || (!defined(__CUDACC__)) -inline void ensure_cuda_lock_arrays_on_device() {} +#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #else -static inline void ensure_cuda_lock_arrays_on_device() { - Impl::copy_cuda_lock_arrays_to_device(); -} +#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ + DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() #endif -} // namespace desul - -#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */ +#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */ diff --git a/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp b/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp index 19944b378e..cb8482c5da 100644 --- a/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp +++ b/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp @@ -70,7 +70,7 @@ void init_lock_arrays_cuda() { "init_lock_arrays_cuda: cudaMalloc host locks"); auto error_sync1 = cudaDeviceSynchronize(); - copy_cuda_lock_arrays_to_device(); + DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs"); init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); auto error_sync2 = cudaDeviceSynchronize(); @@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() { CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; #ifdef __CUDACC_RDC__ - copy_cuda_lock_arrays_to_device(); + 
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); #endif } diff --git a/src/KOKKOS/fix_langevin_kokkos.h b/src/KOKKOS/fix_langevin_kokkos.h index 43f809d548..97dbd28a33 100644 --- a/src/KOKKOS/fix_langevin_kokkos.h +++ b/src/KOKKOS/fix_langevin_kokkos.h @@ -190,6 +190,20 @@ namespace LAMMPS_NS { Tp_BIAS,Tp_RMASS,Tp_ZERO>(i); } + KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { + update.fx = 0.0; + update.fy = 0.0; + update.fz = 0.0; + } + KOKKOS_INLINE_FUNCTION + static void join(value_type &update, + const value_type &source) { + update.fx += source.fx; + update.fy += source.fy; + update.fz += source.fz; + } + KOKKOS_INLINE_FUNCTION static void init(volatile value_type &update) { update.fx = 0.0; @@ -233,6 +247,15 @@ namespace LAMMPS_NS { energy += c.compute_energy_item(i); } KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { + update = 0.0; + } + KOKKOS_INLINE_FUNCTION + static void join(value_type &update, + const value_type &source) { + update += source; + } + KOKKOS_INLINE_FUNCTION static void init(volatile value_type &update) { update = 0.0; } diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index d735419ab3..9bbfb4157f 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -77,7 +77,6 @@ GPU_AWARE_UNKNOWN using namespace LAMMPS_NS; -Kokkos::InitArguments KokkosLMP::args{-1, -1, -1, false}; int KokkosLMP::is_finalized = 0; int KokkosLMP::init_ngpus = 0; @@ -110,7 +109,6 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) ngpus = 0; int device = 0; nthreads = 1; - numa = 1; int iarg = 0; while (iarg < narg) { @@ -189,30 +187,24 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) iarg += 2; - } else if (strcmp(arg[iarg],"n") == 0 || - strcmp(arg[iarg],"numa") == 0) { - numa = utils::inumeric(FLERR, arg[iarg+1], false, lmp); - iarg += 2; - } else error->all(FLERR,"Invalid Kokkos command-line arg: {}", arg[iarg]); } // Initialize Kokkos. However, we cannot change any // Kokkos library parameters after the first initalization - if (args.num_threads != -1) { - if ((args.num_threads != nthreads) || (args.num_numa != numa) || (args.device_id != device)) + Kokkos::InitializationSettings args; + + if (args.has_num_threads()) { + if ((args.get_num_threads() != nthreads) || (args.get_device_id() != device)) if (me == 0) - error->warning(FLERR,"Kokkos package already initalized, " - "cannot reinitialize with different parameters"); - nthreads = args.num_threads; - numa = args.num_numa; - device = args.device_id; + error->warning(FLERR,"Kokkos package already initalized. 
Cannot change parameters");
+    nthreads = args.get_num_threads();
+    device = args.get_device_id();
     ngpus = init_ngpus;
   } else {
-    args.num_threads = nthreads;
-    args.num_numa = numa;
-    args.device_id = device;
+    args.set_num_threads(nthreads);
+    args.set_device_id(device);
     init_ngpus = ngpus;
   }
@@ -350,7 +342,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 /* ---------------------------------------------------------------------- */
 
-void KokkosLMP::initialize(Kokkos::InitArguments args, Error *error)
+void KokkosLMP::initialize(Kokkos::InitializationSettings args, Error *error)
 {
   if (!Kokkos::is_initialized()) {
     if (is_finalized)
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index b6a9d57345..02189e6c4a 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -43,7 +43,6 @@ class KokkosLMP : protected Pointers {
   int forward_fix_comm_changed;
   int reverse_comm_changed;
   int nthreads,ngpus;
-  int numa;
   int auto_sync;
   int gpu_aware_flag;
   int neigh_thread;
@@ -53,12 +52,11 @@ class KokkosLMP : protected Pointers {
   double binsize;
   static int is_finalized;
-  static Kokkos::InitArguments args;
   static int init_ngpus;
   KokkosLMP(class LAMMPS *, int, char **);
-  static void initialize(Kokkos::InitArguments, Error *);
+  static void initialize(Kokkos::InitializationSettings, Error *);
   static void finalize();
   void accelerator(int, char **);
   int neigh_count(int);
diff --git a/src/KOKKOS/pair_reaxff_kokkos.h b/src/KOKKOS/pair_reaxff_kokkos.h
index 5edf439641..56f89d4071 100644
--- a/src/KOKKOS/pair_reaxff_kokkos.h
+++ b/src/KOKKOS/pair_reaxff_kokkos.h
@@ -524,6 +524,12 @@ struct PairReaxKokkosFindBondFunctor {
   PairReaxFFKokkos c;
   PairReaxKokkosFindBondFunctor(PairReaxFFKokkos* c_ptr):c(*c_ptr) {};
+  KOKKOS_INLINE_FUNCTION
+  void join(int &dst,
+            const int &src) const {
+    dst = MAX(dst,src);
+  }
+
   KOKKOS_INLINE_FUNCTION
   void join(volatile int &dst,
             const volatile int &src) const {
diff --git a/src/accelerator_kokkos.h b/src/accelerator_kokkos.h
index c064d73728..36a376bff8 100644
--- a/src/accelerator_kokkos.h
+++ b/src/accelerator_kokkos.h
@@ -52,7 +52,6 @@ class KokkosLMP {
   int kokkos_exists;
   int nthreads;
   int ngpus;
-  int numa;
   KokkosLMP(class LAMMPS *, int, char **) { kokkos_exists = 0; }
   ~KokkosLMP() {}
diff --git a/src/comm.cpp b/src/comm.cpp
index 3debdd36cb..a6ac1c4bc8 100644
--- a/src/comm.cpp
+++ b/src/comm.cpp
@@ -91,7 +91,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
   nthreads = 1;
 #ifdef _OPENMP
   if (lmp->kokkos) {
-    nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
+    nthreads = lmp->kokkos->nthreads;
   } else if (getenv("OMP_NUM_THREADS") == nullptr) {
     nthreads = 1;
     if (me == 0)