Merge pull request #3532 from stanmoore1/kk_occupancy

Update Kokkos version in LAMMPS to 3.7.1
This commit is contained in:
Axel Kohlmeyer
2023-01-20 17:52:05 -05:00
committed by GitHub
67 changed files with 2295 additions and 561 deletions

View File

@ -49,8 +49,8 @@ if(DOWNLOAD_KOKKOS)
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
include(ExternalProject) include(ExternalProject)
set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.00.tar.gz" CACHE STRING "URL for KOKKOS tarball") set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
set(KOKKOS_MD5 "84991eca9f066383abe119a5bc7a11c4" CACHE STRING "MD5 checksum of KOKKOS tarball") set(KOKKOS_MD5 "f140e02b826223b1045207d9bc10d404" CACHE STRING "MD5 checksum of KOKKOS tarball")
mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_URL)
mark_as_advanced(KOKKOS_MD5) mark_as_advanced(KOKKOS_MD5)
ExternalProject_Add(kokkos_build ExternalProject_Add(kokkos_build
@ -74,7 +74,7 @@ if(DOWNLOAD_KOKKOS)
add_dependencies(LAMMPS::KOKKOSCORE kokkos_build) add_dependencies(LAMMPS::KOKKOSCORE kokkos_build)
add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build) add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build)
elseif(EXTERNAL_KOKKOS) elseif(EXTERNAL_KOKKOS)
find_package(Kokkos 3.7.00 REQUIRED CONFIG) find_package(Kokkos 3.7.01 REQUIRED CONFIG)
target_link_libraries(lammps PRIVATE Kokkos::kokkos) target_link_libraries(lammps PRIVATE Kokkos::kokkos)
target_link_libraries(lmp PRIVATE Kokkos::kokkos) target_link_libraries(lmp PRIVATE Kokkos::kokkos)
else() else()

View File

@ -105,13 +105,12 @@ Either the full word or an abbreviation can be used for the keywords.
Note that the keywords do not use a leading minus sign. I.e. the Note that the keywords do not use a leading minus sign. I.e. the
keyword is "t", not "-t". Also note that each of the keywords has a keyword is "t", not "-t". Also note that each of the keywords has a
default setting. Examples of when to use these options and what default setting. Examples of when to use these options and what
settings to use on different platforms is given on the :doc:`KOKKOS package <Speed_kokkos>` settings to use on different platforms is given on the :doc:`KOKKOS
doc page. package <Speed_kokkos>` doc page.
* d or device * d or device
* g or gpus * g or gpus
* t or threads * t or threads
* n or numa
.. parsed-literal:: .. parsed-literal::
@ -164,19 +163,10 @@ the number of physical cores per node, to use your available hardware
optimally. This also sets the number of threads used by the host when optimally. This also sets the number of threads used by the host when
LAMMPS is compiled with CUDA=yes. LAMMPS is compiled with CUDA=yes.
.. parsed-literal:: .. deprecated:: 22Dec2022
numa Nm Support for the "numa" or "n" option was removed as its functionality
was ignored in Kokkos for some time already.
This option is only relevant when using pthreads with hwloc support.
In this case Nm defines the number of NUMA regions (typically sockets)
on a node which will be utilized by a single MPI rank. By default Nm
= 1. If this option is used the total number of worker-threads per
MPI rank is threads\*numa. Currently it is always almost better to
assign at least one MPI rank per NUMA region, and leave numa set to
its default value of 1. This is because letting a single process span
multiple NUMA regions induces a significant amount of cross NUMA data
traffic which is slow.
---------- ----------

View File

@ -1,5 +1,27 @@
# Change Log # Change Log
## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)
### Bug Fixes:
- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547)
- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552)
- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521)
- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553)
- Fix warning calling a `__host__` function from a `__host__ __device__` from `View:: as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591)
- OpenMPTarget: adding implementation to set device id. [\#5557](https://github.com/kokkos/kokkos/pull/5557)
- Use `Kokkos::atomic_load` to Correct Race Condition Giving Rise to Seg Faulting Error in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559)
- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561)
- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565)
- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566)
- Do not add -cuda to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569)
- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571)
- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582)
- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573)
- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592)
- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607)
- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622)
## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22) ## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
@ -102,7 +124,6 @@
- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120) - Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117) - Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111) - Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059)
- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957) - Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810) - Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382) - Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)

View File

@ -129,7 +129,7 @@ ENDIF()
set(Kokkos_VERSION_MAJOR 3) set(Kokkos_VERSION_MAJOR 3)
set(Kokkos_VERSION_MINOR 7) set(Kokkos_VERSION_MINOR 7)
set(Kokkos_VERSION_PATCH 00) set(Kokkos_VERSION_PATCH 01)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
@ -152,6 +152,7 @@ ENDIF()
# but scoping issues can make it difficult # but scoping issues can make it difficult
GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) GLOBAL_SET(KOKKOS_COMPILE_OPTIONS)
GLOBAL_SET(KOKKOS_LINK_OPTIONS) GLOBAL_SET(KOKKOS_LINK_OPTIONS)
GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
GLOBAL_SET(KOKKOS_CUDA_OPTIONS) GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS)
GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS)
@ -228,6 +229,9 @@ IF (KOKKOS_HAS_TRILINOS)
# we have to match the annoying behavior, also we have to preserve quotes # we have to match the annoying behavior, also we have to preserve quotes
# which needs another workaround. # which needs another workaround.
SET(KOKKOS_COMPILE_OPTIONS_TMP) SET(KOKKOS_COMPILE_OPTIONS_TMP)
IF (KOKKOS_ENABLE_HIP)
LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS})
ENDIF()
FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
IF(OPTION_HAS_WHITESPACE EQUAL -1) IF(OPTION_HAS_WHITESPACE EQUAL -1)

View File

@ -12,7 +12,7 @@ endif
KOKKOS_VERSION_MAJOR = 3 KOKKOS_VERSION_MAJOR = 3
KOKKOS_VERSION_MINOR = 7 KOKKOS_VERSION_MINOR = 7
KOKKOS_VERSION_PATCH = 00 KOKKOS_VERSION_PATCH = 01
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads" #KOKKOS_DEVICES ?= "Threads"
# Options: # Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9 # IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A # AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me #SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@ -505,10 +507,6 @@ KOKKOS_LINK_FLAGS =
KOKKOS_SRC = KOKKOS_SRC =
KOKKOS_HEADERS = KOKKOS_HEADERS =
#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
KOKKOS_LIBS += -latomic
#endif
# Generating the KokkosCore_config.h file. # Generating the KokkosCore_config.h file.
KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@ -550,6 +548,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
endif endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
KOKKOS_LIBS += -latomic
tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET') tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC") tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
@ -1197,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE TEST_OPTIONAL_TPLS CUSPARSE
) )

View File

@ -265,8 +265,8 @@ class BinSort {
//---------------------------------------- //----------------------------------------
// Create the permutation vector, the bin_offset array and the bin_count // Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed // array. Can be called again if keys changed
template <class ExecutionSpace = exec_space> template <class ExecutionSpace>
void create_permute_vector(const ExecutionSpace& exec = exec_space{}) { void create_permute_vector(const ExecutionSpace& exec) {
static_assert( static_assert(
Kokkos::SpaceAccessibility<ExecutionSpace, Kokkos::SpaceAccessibility<ExecutionSpace,
typename Space::memory_space>::accessible, typename Space::memory_space>::accessible,
@ -297,6 +297,15 @@ class BinSort {
*this); *this);
} }
// Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed
void create_permute_vector() {
Kokkos::fence("Kokkos::Binsort::create_permute_vector: before");
exec_space e{};
create_permute_vector(e);
e.fence("Kokkos::Binsort::create_permute_vector: after");
}
// Sort a subset of a view with respect to the first dimension using the // Sort a subset of a view with respect to the first dimension using the
// permutation array // permutation array
template <class ExecutionSpace, class ValuesViewType> template <class ExecutionSpace, class ValuesViewType>
@ -372,9 +381,10 @@ class BinSort {
template <class ValuesViewType> template <class ValuesViewType>
void sort(ValuesViewType const& values, int values_range_begin, void sort(ValuesViewType const& values, int values_range_begin,
int values_range_end) const { int values_range_end) const {
Kokkos::fence("Kokkos::Binsort::sort: before");
exec_space exec; exec_space exec;
sort(exec, values, values_range_begin, values_range_end); sort(exec, values, values_range_begin, values_range_end);
exec.fence("Kokkos::Sort: fence after sorting"); exec.fence("Kokkos::BinSort:sort: after");
} }
template <class ExecutionSpace, class ValuesViewType> template <class ExecutionSpace, class ValuesViewType>
@ -641,9 +651,10 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
template <class ViewType> template <class ViewType>
void sort(ViewType const& view) { void sort(ViewType const& view) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec; typename ViewType::execution_space exec;
sort(exec, view); sort(exec, view);
exec.fence("Kokkos::Sort: fence after sorting"); exec.fence("Kokkos::sort: fence after sorting");
} }
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
@ -682,6 +693,7 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
template <class ViewType> template <class ViewType>
void sort(ViewType view, size_t const begin, size_t const end) { void sort(ViewType view, size_t const begin, size_t const end) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec; typename ViewType::execution_space exec;
sort(exec, view, begin, end); sort(exec, view, begin, end);
exec.fence("Kokkos::Sort: fence after sorting"); exec.fence("Kokkos::Sort: fence after sorting");

View File

@ -10,10 +10,12 @@
# Default settings: change those according to your machine. For # Default settings: change those according to your machine. For
# example, you may have have two different wrappers with either icpc # example, you may have have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten # or g++ as their back-end compiler. The defaults can be overwritten
# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc). # by using the usual arguments (e.g., -arch=sm_80 -ccbin icpc).
# sm_70 is supported by every CUDA version from 9-12 and is thus
# chosen as default
default_arch="sm_35" default_arch="sm_70"
#default_arch="sm_50" #default_arch="sm_80"
# #
# The default C++ compiler. # The default C++ compiler.

View File

@ -66,6 +66,7 @@
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
#cmakedefine KOKKOS_ARCH_ARMV81 #cmakedefine KOKKOS_ARCH_ARMV81
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
#cmakedefine KOKKOS_ARCH_A64FX
#cmakedefine KOKKOS_ARCH_AMD_AVX2 #cmakedefine KOKKOS_ARCH_AMD_AVX2
#cmakedefine KOKKOS_ARCH_AVX #cmakedefine KOKKOS_ARCH_AVX
#cmakedefine KOKKOS_ARCH_AVX2 #cmakedefine KOKKOS_ARCH_AVX2
@ -101,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE #cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80 #cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_AMPERE86
#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2 #cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3 #cmakedefine KOKKOS_ARCH_AMD_ZEN3

View File

@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default: default:
std::cout << "Compute capability " << compute_capability std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl; << " is not supported" << std::endl;

View File

@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS) IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON) SET(KOKKOS_SHOW_HIP_ARCHS ON)
@ -187,7 +188,9 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
SET(CUDA_ARCH_FLAG "-gpu") SET(CUDA_ARCH_FLAG "-gpu")
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda)
IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled
GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda) GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
ENDIF()
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
SET(CUDA_ARCH_FLAG "-arch") SET(CUDA_ARCH_FLAG "-arch")
ENDIF() ENDIF()
@ -542,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75) CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80) CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86) CHECK_CUDA_ARCH(AMPERE86 sm_86)
CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@ -804,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON) SET(KOKKOS_ARCH_AMPERE ON)
ENDIF() ENDIF()
IF (KOKKOS_ARCH_HOPPER90)
SET(KOKKOS_ARCH_HOPPER ON)
ENDIF()
#Regardless of version, make sure we define the general architecture name #Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A) IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON) SET(KOKKOS_ARCH_VEGA ON)

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE TEST_OPTIONAL_TPLS CUSPARSE
) )

View File

@ -1701,7 +1701,11 @@ namespace Impl {
underlying memory, to facilitate implementation of deep_copy() and underlying memory, to facilitate implementation of deep_copy() and
other routines that are defined on View */ other routines that are defined on View */
template <unsigned N, typename T, typename... Args> template <unsigned N, typename T, typename... Args>
KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView<T, Args...> v) { KOKKOS_FUNCTION auto as_view_of_rank_n(
DynRankView<T, Args...> v,
typename std::enable_if<std::is_same<
typename ViewTraits<T, Args...>::specialize, void>::value>::type* =
nullptr) {
if (v.rank() != N) { if (v.rank() != N) {
KOKKOS_IF_ON_HOST( KOKKOS_IF_ON_HOST(
const std::string message = const std::string message =
@ -2114,6 +2118,7 @@ inline auto create_mirror(
namespace Impl { namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
std::is_same< std::is_same<
typename DynRankView<T, P...>::memory_space, typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value && typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
@ -2128,6 +2133,7 @@ create_mirror_view(const DynRankView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same< !(std::is_same<
typename DynRankView<T, P...>::memory_space, typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value && typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
@ -2141,29 +2147,39 @@ create_mirror_view(
return Kokkos::Impl::create_mirror(src, arg_prop); return Kokkos::Impl::create_mirror(src, arg_prop);
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t< inline std::enable_if_t<
Kokkos::is_space<Space>::value && Kokkos::is_space<
Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
typename Impl::MirrorDRViewType<Space, T, P...>::view_type> Impl::MirrorDRViewType<
create_mirror_view(const Space&, const Kokkos::DynRankView<T, P...>& src, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view(const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>&) { const typename Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t< inline std::enable_if_t<
Kokkos::is_space<Space>::value && Kokkos::is_space<
!Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
typename Impl::MirrorDRViewType<Space, T, P...>::view_type> !Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view( create_mirror_view(
const Space&, const Kokkos::DynRankView<T, P...>& src, const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space; return Kokkos::Impl::create_mirror(src, arg_prop);
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);
return Kokkos::Impl::create_mirror(src, prop_copy);
} }
} // namespace Impl } // namespace Impl
@ -2224,9 +2240,10 @@ create_mirror_view(
template <class Space, class T, class... P> template <class Space, class T, class... P>
inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi,
const Space& space, const Space&,
const Kokkos::DynRankView<T, P...>& src) { const Kokkos::DynRankView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi)); return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>

View File

@ -710,7 +710,7 @@ template <class Space, class T, class... P>
inline auto create_mirror( inline auto create_mirror(
const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) { const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror( return Impl::create_mirror(
src, Impl::ViewCtorProp<>{typename Space::memory_space{}}); src, Kokkos::view_alloc(typename Space::memory_space{}));
} }
template <class Space, class T, class... P> template <class Space, class T, class... P>
@ -729,8 +729,10 @@ inline auto create_mirror(
} }
namespace Impl { namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same< (std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView< typename Kokkos::Experimental::DynamicView<
@ -740,14 +742,14 @@ inline std::enable_if_t<
typename Kokkos::Experimental::DynamicView< typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value), T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror> typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
create_mirror_view( create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const typename Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same< !(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView< typename Kokkos::Experimental::DynamicView<
@ -762,15 +764,33 @@ create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
return Kokkos::create_mirror(arg_prop, src); return Kokkos::create_mirror(arg_prop, src);
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
inline std::enable_if_t< class = std::enable_if_t<
Impl::MirrorDynamicViewType<Space, T, P...>::is_same_memspace, Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type> std::enable_if_t<Impl::MirrorDynamicViewType<
create_mirror_view(const Space&, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
const Kokkos::Experimental::DynamicView<T, P...>& src, T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl } // namespace Impl
// Create a mirror view in host space // Create a mirror view in host space
@ -790,8 +810,9 @@ inline auto create_mirror_view(
// Create a mirror in a new space // Create a mirror in a new space
template <class Space, class T, class... P> template <class Space, class T, class... P>
inline auto create_mirror_view( inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::DynamicView<T, P...>& src) { const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{}); return Impl::create_mirror_view(src,
view_alloc(typename Space::memory_space{}));
} }
template <class Space, class T, class... P> template <class Space, class T, class... P>

View File

@ -1901,19 +1901,22 @@ struct MirrorOffsetType {
namespace Impl { namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src, create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror( return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror(
Kokkos::create_mirror(arg_prop, src.view()), src.begins()); Kokkos::create_mirror(arg_prop, src.view()), src.begins());
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
inline typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type class = std::enable_if_t<
create_mirror(const Space&, Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
const Kokkos::Experimental::OffsetView<T, P...>& src, inline auto create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space;
static_assert( static_assert(
!alloc_prop_input::has_label, !alloc_prop_input::has_label,
@ -1923,10 +1926,6 @@ create_mirror(const Space&,
!alloc_prop_input::has_pointer, !alloc_prop_input::has_pointer,
"The view constructor arguments passed to Kokkos::create_mirror must " "The view constructor arguments passed to Kokkos::create_mirror must "
"not include a pointer!"); "not include a pointer!");
static_assert(
!alloc_prop_input::has_memory_space,
"The view constructor arguments passed to Kokkos::create_mirror must "
"not include a memory space instance!");
static_assert( static_assert(
!alloc_prop_input::allow_padding, !alloc_prop_input::allow_padding,
"The view constructor arguments passed to Kokkos::create_mirror must " "The view constructor arguments passed to Kokkos::create_mirror must "
@ -1962,15 +1961,17 @@ inline auto create_mirror(
template <class Space, class T, class... P, template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror( inline auto create_mirror(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) { const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{}); return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}));
} }
template <class Space, class T, class... P> template <class Space, class T, class... P>
typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space, create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) { const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Kokkos::view_alloc(wi)); return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
@ -1983,22 +1984,24 @@ inline auto create_mirror(
namespace Impl { namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same< (std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView< typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value && T, P...>::HostMirror::memory_space>::value &&
std::is_same<typename Kokkos::Experimental::OffsetView<T, P...>::data_type, std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView< typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value), T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror> typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror_view( create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const typename Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same< !(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView< typename Kokkos::Experimental::OffsetView<
@ -2013,24 +2016,32 @@ create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
return Kokkos::create_mirror(arg_prop, src); return Kokkos::create_mirror(arg_prop, src);
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
inline std::enable_if_t< class = std::enable_if_t<
Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace, Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
Kokkos::Experimental::OffsetView<T, P...>> std::enable_if_t<Impl::MirrorOffsetViewType<
create_mirror_view(const Space&, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
const Kokkos::Experimental::OffsetView<T, P...>& src, T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
std::enable_if_t< class = std::enable_if_t<
!Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace, Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type> std::enable_if_t<!Impl::MirrorOffsetViewType<
create_mirror_view(const Space& space, typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
const Kokkos::Experimental::OffsetView<T, P...>& src, T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return create_mirror(space, src, arg_prop); return Kokkos::Impl::create_mirror(src, arg_prop);
} }
} // namespace Impl } // namespace Impl
@ -2052,15 +2063,17 @@ inline auto create_mirror_view(
template <class Space, class T, class... P, template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror_view( inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) { const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{}); return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}));
} }
template <class Space, class T, class... P> template <class Space, class T, class... P>
inline auto create_mirror_view( inline auto create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, const Space& space, Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) { const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi)); return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>

View File

@ -46,3 +46,13 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources})
endif() endif()
endforeach() endforeach()
SET(COMPILE_ONLY_SOURCES
TestCreateMirror.cpp
)
KOKKOS_ADD_EXECUTABLE(
TestCompileOnly
SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES}
)

View File

@ -0,0 +1 @@
int main() {}

View File

@ -0,0 +1,179 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <Kokkos_DynamicView.hpp>
#include <Kokkos_DynRankView.hpp>
#include <Kokkos_OffsetView.hpp>
template <typename TestView, typename MemorySpace>
void check_memory_space(TestView, MemorySpace) {
static_assert(
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
}
template <class View>
auto host_mirror_test_space(View) {
return std::conditional_t<
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
typename View::memory_space>::accessible,
typename View::memory_space, Kokkos::HostSpace>{};
}
template <typename View>
void test_create_mirror_properties(const View& view) {
using namespace Kokkos;
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
// clang-format off
// create_mirror
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror( view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror_view
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror view_alloc
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc + execution space
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
// clang-format on
}
void test_create_mirror_dynrankview() {
Kokkos::DynRankView<int, Kokkos::DefaultExecutionSpace> device_view(
"device view", 10);
Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
void test_reate_mirror_offsetview() {
Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace>
device_view("device view", {0, 10});
Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
"host view", {0, 10});
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
void test_create_mirror_dynamicview() {
Kokkos::Experimental::DynamicView<int*, Kokkos::DefaultExecutionSpace>
device_view("device view", 2, 10);
Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
"host view", 2, 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib
TEST_OPTIONAL_TPLS CUSPARSE TEST_OPTIONAL_TPLS CUSPARSE
) )

View File

@ -53,13 +53,69 @@
namespace Kokkos { namespace Kokkos {
namespace Impl { namespace Impl {
inline int cuda_warp_per_sm_allocation_granularity(
cudaDeviceProp const& properties) {
// Allocation granularity of warps in each sm
switch (properties.major) {
case 3:
case 5:
case 7:
case 8:
case 9: return 4;
case 6: return (properties.minor == 0 ? 2 : 4);
default:
throw_runtime_exception(
"Unknown device in cuda warp per sm allocation granularity");
return 0;
}
}
inline int cuda_max_warps_per_sm_registers(
cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
// Maximum number of warps per sm as a function of register counts,
// subject to the constraint that warps are allocated with a fixed granularity
int const max_regs_per_block = properties.regsPerBlock;
int const regs_per_warp = attributes.numRegs * properties.warpSize;
int const warp_granularity =
cuda_warp_per_sm_allocation_granularity(properties);
// The granularity of register allocation is chunks of 256 registers per warp,
// which implies a need to over-allocate, so we round up
int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);
// The maximum number of warps per SM is constrained from above by register
// allocation. To satisfy the constraint that warps per SM is allocated at a
// finite granularity, we need to round down.
int const max_warps_per_sm =
warp_granularity *
(max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
return max_warps_per_sm;
}
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes, cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) { int block_size, size_t dynamic_shmem) {
// Limits due do registers/SM // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor; int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs; int const regs_per_thread = attributes.numRegs;
int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size); // The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
// Compute the maximum number of warps as a function of the number of
// registers
int const max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(properties, attributes);
// Constrain the number of blocks to respect the maximum number of warps per
// SM On face value this should be an equality, but due to the warp
// granularity constraints noted in `cuda_max_warps_per_sm_registers` the
// left-hand-side of this comparison can overshoot what the hardware allows
// based on register counts alone
while ((max_blocks_regs * block_size / properties.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;
// Limits due to shared memory/SM // Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor; size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@ -203,40 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{}); LaunchBounds{});
} }
// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1) template <class LaunchBounds>
// NOTE these number can be obtained several ways: int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select LaunchBounds) {
// "Compute Capability" first and check what is the smallest "Shared Memory auto const& prop = Kokkos::Cuda().cuda_device_prop();
// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
// bytes is then to be found below in the summary. // Thin version of cuda_get_opt_block_size for cases where there is no shared
// * Another option would be to look for the information in the "Tuning // memory
// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
// the "Shared Memory" section (more tedious)
inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
int const compute_capability = properties.major * 10 + properties.minor; LaunchBounds{});
return [compute_capability]() {
switch (compute_capability) {
case 30:
case 32:
case 35: return 16;
case 37: return 80;
case 50:
case 53:
case 60:
case 62: return 64;
case 52:
case 61: return 96;
case 70:
case 80:
case 86: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
"Unknown device in cuda block size deduction");
}
return 0;
}() * 1024;
} }
} // namespace Impl } // namespace Impl
} // namespace Kokkos } // namespace Kokkos

View File

@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010 #endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \ #if CUDA_VERSION >= 11010 && \
((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86))) ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; } bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION

View File

@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
} }
#endif #endif
#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
#else
cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
#endif
// Init the array for used for arbitrarily sized atomics // Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays(); if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();

View File

@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance. // function qualifier which could be used to improve performance.
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template <class DriverType> template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() { __global__ static void cuda_parallel_launch_constant_memory() {
@ -158,63 +154,119 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
} }
} }
// This function needs to be templated on DriverType and LaunchBounds // These functions need to be templated on DriverType and LaunchBounds
// so that the static bool is unique for each type combo // so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information. // KernelFuncPtr does not necessarily contain that type information.
template <class DriverType, class LaunchBounds, class KernelFuncPtr> template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(KernelFuncPtr const& func, const cudaFuncAttributes& get_cuda_kernel_func_attributes(
bool prefer_shmem) { const KernelFuncPtr& func) {
#ifndef KOKKOS_ARCH_KEPLER // Only call cudaFuncGetAttributes once for each unique kernel
// On Kepler the L1 has no benefit since it doesn't cache reads // by leveraging static variable initialization rules
auto set_cache_config = [&] { auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig( cudaFuncAttributes attr;
func, KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
(prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1))); return attr;
return prefer_shmem;
}; };
static bool cache_config_preference_cached = set_cache_config(); static cudaFuncAttributes func_attr = wrap_get_attributes();
if (cache_config_preference_cached != prefer_shmem) { return func_attr;
}
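A minimal sketch, in plain C++ with a hypothetical Attributes type, of the caching idiom used by get_cuda_kernel_func_attributes above: the function-local static is initialized exactly once (and thread-safely) per unique template instantiation, so the expensive query runs a single time per kernel type.

#include <cstdio>

struct Attributes { int num_regs; };

template <class KernelTag>
const Attributes& cached_attributes() {
  auto query = []() -> Attributes {
    std::puts("expensive query (stand-in for cudaFuncGetAttributes)");
    return Attributes{42};
  };
  static Attributes attr = query();  // initialized once per KernelTag
  return attr;
}

struct KernelA {};
struct KernelB {};

int main() {
  cached_attributes<KernelA>();  // runs the query
  cached_attributes<KernelA>();  // cached, no query
  cached_attributes<KernelB>();  // separate instantiation, runs the query
}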
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(const KernelFuncPtr& func,
const cudaDeviceProp& device_props,
const size_t block_size, int& shmem,
const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
const auto& func_attr =
get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
// Compute limits for number of blocks due to registers/SM
const size_t regs_per_sm = device_props.regsPerMultiprocessor;
const size_t regs_per_thread = func_attr.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
// Compute the maximum number of warps as a function of the number of
// registers
const size_t max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(device_props, func_attr);
// Constrain the number of blocks to respect the maximum number of warps per
// SM. On face value this should be an equality, but due to the warp
// granularity constraints noted in `cuda_max_warps_per_sm_registers` the
// left-hand-side of this comparison can overshoot what the hardware allows
// based on register counts alone
while ((max_blocks_regs * block_size / device_props.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
const size_t num_threads_desired =
((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
// Get close to the desired occupancy,
// don't undershoot by much but also don't allocate a whole new block just
// because one is a few threads over otherwise.
size_t num_blocks_desired =
(num_threads_desired + block_size * 0.8) / block_size;
num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
if (num_blocks_desired == 0) num_blocks_desired = 1;
// Calculate how much shared memory we need per block
size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
// The minimum shared memory allocation we can have in total per SM is 8kB.
// If we want to lower occupancy we have to make sure we request at least that
// much in aggregate over all blocks, so that shared memory actually becomes a
// limiting factor for occupancy
constexpr size_t min_shmem_size_per_sm = 8192;
if ((occupancy < 100) &&
(shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
// Need to set the caller's shmem variable so that the
// kernel launch uses the correct dynamic shared memory request
shmem = shmem_per_block - func_attr.sharedSizeBytes;
}
// Compute the carveout fraction we need based on occupancy
// Use multiples of 8kB
const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
size_t carveout = shmem_per_block == 0
? 0
: 100 *
(((num_blocks_desired * shmem_per_block +
min_shmem_size_per_sm - 1) /
min_shmem_size_per_sm) *
min_shmem_size_per_sm) /
max_shmem_per_sm;
if (carveout > 100) carveout = 100;
// Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
return carveout;
};
// Store the value in a static variable so we only reset if needed
static size_t cache_config_preference_cached = set_cache_config();
if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config(); cache_config_preference_cached = set_cache_config();
} }
#else #else
// Use the parameters so we don't get a warning // Use the parameters so we don't get a warning
(void)func; (void)func;
(void)prefer_shmem; (void)device_props;
(void)block_size;
(void)occupancy;
#endif #endif
} }
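A standalone worked example, with assumed device properties and an assumed 50% occupancy request, of the arithmetic in configure_shmem_preference above: the dynamic shared-memory request is padded so that shared memory actually becomes the occupancy limiter, and the carveout is then derived in 8 KiB steps.

#include <algorithm>
#include <cstdio>

int main() {
  // Assumed values for illustration only.
  const size_t max_threads_per_sm = 2048;
  const size_t max_shmem_per_sm   = 98304;  // 96 KiB
  const size_t block_size         = 128;
  const size_t occupancy          = 50;     // requested percentage
  const size_t max_blocks_regs    = 32;     // assume registers are not limiting
  const size_t static_shmem       = 0;
  size_t shmem                    = 0;      // dynamic request from the caller

  const size_t num_threads_desired =
      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;          // 1024
  size_t num_blocks_desired =
      (num_threads_desired + block_size * 0.8) / block_size;            // 8
  num_blocks_desired = std::min(max_blocks_regs, num_blocks_desired);
  if (num_blocks_desired == 0) num_blocks_desired = 1;

  size_t shmem_per_block = shmem + static_shmem;
  constexpr size_t min_shmem_size_per_sm = 8192;
  if ((occupancy < 100) &&
      (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
    shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;       // 1024
    shmem           = shmem_per_block - static_shmem;                   // 1024
  }

  size_t carveout =
      shmem_per_block == 0
          ? 0
          : 100 *
                (((num_blocks_desired * shmem_per_block +
                   min_shmem_size_per_sm - 1) /
                  min_shmem_size_per_sm) *
                 min_shmem_size_per_sm) /
                max_shmem_per_sm;                                       // 8
  if (carveout > 100) carveout = 100;

  std::printf("padded dynamic shmem: %zu bytes, carveout: %zu%%\n", shmem,
              carveout);
}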
template <class Policy>
std::enable_if_t<Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const& policy, cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
bool& prefer_shmem) {
int const block_size = block.x * block.y * block.z;
int const desired_occupancy = policy.impl_get_desired_occupancy().value();
size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
size_t const static_shmem = attributes.sharedSizeBytes;
// round to nearest integer and avoid division by zero
int active_blocks = std::max(
1, static_cast<int>(std::round(
static_cast<double>(properties.maxThreadsPerMultiProcessor) /
block_size * desired_occupancy / 100)));
int const dynamic_shmem =
shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
if (dynamic_shmem > shmem) {
shmem = dynamic_shmem;
prefer_shmem = false;
}
}
template <class Policy>
std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
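A small sketch, with hypothetical policy types, of the enable_if dispatch pattern used by the two overloads above: the overload is chosen at compile time from a constexpr bool trait of the policy, and the no-op overload compiles away for policies without an occupancy request.

#include <cstdio>
#include <type_traits>

struct PlainPolicy {
  static constexpr bool contains_desired_occupancy = false;
};
struct OccupancyPolicy {
  static constexpr bool contains_desired_occupancy = true;
  int desired_occupancy = 75;
};

template <class Policy>
std::enable_if_t<Policy::contains_desired_occupancy> tune_launch(
    const Policy& p, int& shmem) {
  std::printf("tuning launch for %d%% occupancy\n", p.desired_occupancy);
  shmem += 1024;  // stand-in for the shared-memory adjustment
}

template <class Policy>
std::enable_if_t<!Policy::contains_desired_occupancy> tune_launch(
    const Policy&, int& /*shmem*/) {}  // no-op overload

int main() {
  int shmem = 0;
  tune_launch(PlainPolicy{}, shmem);      // picks the no-op overload
  tune_launch(OccupancyPolicy{}, shmem);  // picks the tuning overload
  std::printf("shmem = %d\n", shmem);
}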
// </editor-fold> end Some helper functions for launch code readability }}}1 // </editor-fold> end Some helper functions for launch code readability }}}1
//============================================================================== //==============================================================================
@ -348,7 +400,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node( inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) { CudaInternal const* cuda_instance) {
//---------------------------------------- //----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver); auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph)); KOKKOS_EXPECTS(bool(graph));
@ -358,8 +410,19 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) { if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem); Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>( Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem); base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
void const* args[] = {&driver}; void const* args[] = {&driver};
@ -442,7 +505,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node( inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) { CudaInternal const* cuda_instance) {
//---------------------------------------- //----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver); auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph)); KOKKOS_EXPECTS(bool(graph));
@ -452,8 +515,18 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) { if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem); Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>( Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem); base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@ -566,7 +639,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node( inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) { CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant // Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since // memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require // events don't work with Graphs directly, and this would anyway require
@ -580,7 +653,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds, DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>; Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node( global_launch_impl_t::create_parallel_launch_graph_node(
driver, grid, block, shmem, cuda_instance, prefer_shmem); driver, grid, block, shmem, cuda_instance);
} }
#endif #endif
}; };
@ -613,8 +686,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid, inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem, const dim3& block, int shmem,
const CudaInternal* cuda_instance, const CudaInternal* cuda_instance) {
bool prefer_shmem) {
if (!Impl::is_empty_launch(grid, block)) { if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration // Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel // preference and launch the same kernel
@ -623,20 +695,22 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem); Impl::check_shmem_request(cuda_instance, shmem);
// If a desired occupancy is specified, we compute how much shared memory if (DriverType::Policy::
// to ask for to achieve that occupancy, assuming that the cache experimental_contains_desired_occupancy) {
// configuration is `cudaFuncCachePreferL1`. If the amount of dynamic /*int desired_occupancy =
// shared memory computed is actually smaller than `shmem` we overwrite driver.get_policy().impl_get_desired_occupancy().value();
// `shmem` and set `prefer_shmem` to `false`. size_t block_size = block.x * block.y * block.z;
modify_launch_configuration_if_desired_occupancy_is_specified(
driver.get_policy(), cuda_instance->m_deviceProp,
get_cuda_func_attributes(), block, shmem, prefer_shmem);
Impl::configure_shmem_preference< Impl::configure_shmem_preference<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>( DriverType,
base_t::get_kernel_func(), prefer_shmem); Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
ensure_cuda_lock_arrays_on_device(); KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device // Invoke the driver function on the device
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
@ -650,18 +724,9 @@ struct CudaParallelLaunchImpl<
} }
static cudaFuncAttributes get_cuda_func_attributes() { static cudaFuncAttributes get_cuda_func_attributes() {
// Race condition inside of cudaFuncGetAttributes if the same address is return get_cuda_kernel_func_attributes<
// given requires using a local variable as input instead of a static Rely DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
// on static variable initialization to make sure only one thread executes base_t::get_kernel_func());
// the code and the result is visible.
auto wrap_get_attributes = []() -> cudaFuncAttributes {
cudaFuncAttributes attr_tmp;
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
return attr_tmp;
};
static cudaFuncAttributes attr = wrap_get_attributes();
return attr;
} }
}; };

View File

@ -79,7 +79,8 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
void initialize_host_cuda_lock_arrays() { void initialize_host_cuda_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
desul::Impl::init_lock_arrays(); desul::Impl::init_lock_arrays();
desul::ensure_cuda_lock_arrays_on_device();
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
#endif #endif
if (g_host_cuda_lock_arrays.atomic != nullptr) return; if (g_host_cuda_lock_arrays.atomic != nullptr) return;
KOKKOS_IMPL_CUDA_SAFE_CALL( KOKKOS_IMPL_CUDA_SAFE_CALL(
@ -88,7 +89,7 @@ void initialize_host_cuda_lock_arrays() {
Impl::cuda_device_synchronize( Impl::cuda_device_synchronize(
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays"); "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
g_host_cuda_lock_arrays.n = Cuda::concurrency(); g_host_cuda_lock_arrays.n = Cuda::concurrency();
copy_cuda_lock_arrays_to_device(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
256>>>(); 256>>>();
Impl::cuda_device_synchronize( Impl::cuda_device_synchronize(
@ -105,7 +106,7 @@ void finalize_host_cuda_lock_arrays() {
g_host_cuda_lock_arrays.atomic = nullptr; g_host_cuda_lock_arrays.atomic = nullptr;
g_host_cuda_lock_arrays.n = 0; g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
copy_cuda_lock_arrays_to_device(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif #endif
} }

View File

@ -67,7 +67,7 @@ struct CudaLockArrays {
/// \brief This global variable in Host space is the central definition /// \brief This global variable in Host space is the central definition
/// of these arrays. /// of these arrays.
extern CudaLockArrays g_host_cuda_lock_arrays; extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;
/// \brief After this call, the g_host_cuda_lock_arrays variable has /// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays. /// valid, initialized arrays.
@ -105,12 +105,12 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global /// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels /// variable based on the Host global variable prior to running any kernels
/// that will use it. /// that will use it.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. /// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__ __device__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__constant__ extern __constant__ extern
#endif #endif
CudaLockArrays g_device_cuda_lock_arrays; Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
@ -123,7 +123,9 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr); size_t offset = size_t(ptr);
offset = offset >> 2; offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK; offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1)); return (
0 ==
atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
} }
/// \brief Release lock for the address /// \brief Release lock for the address
@ -136,7 +138,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr); size_t offset = size_t(ptr);
offset = offset >> 2; offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK; offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0); atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
} }
} // namespace Impl } // namespace Impl
@ -149,49 +151,45 @@ namespace {
static int lock_array_copied = 0; static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; } inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace } // namespace
} // namespace Impl
} // namespace Kokkos
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE /* Dan Ibanez: it is critical that this code be a macro, so that it will
inline capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
#else putting this in an inline function will NOT do the right thing! */
static #define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
#endif { \
void if (::Kokkos::Impl::lock_array_copied == 0) { \
copy_cuda_lock_arrays_to_device() { KOKKOS_IMPL_CUDA_SAFE_CALL( \
if (lock_array_copied == 0) { cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays, &Kokkos::Impl::g_host_cuda_lock_arrays, \
&g_host_cuda_lock_arrays, sizeof(Kokkos::Impl::CudaLockArrays))); \
sizeof(CudaLockArrays))); } \
lock_array_copied = 1; \
} }
lock_array_copied = 1;
}
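A minimal CUDA sketch (hypothetical file and names, compiled without -rdc=true) of why the copy is macro-expanded at the call site: each translation unit gets its own instance of a __constant__ symbol, so the cudaMemcpyToSymbol must execute in the same translation unit whose kernels read that instance.

// per_tu_symbol_example.cu
#include <cstdio>
#include <cuda_runtime.h>

__device__ __constant__ int g_config;  // one instance per translation unit

#define COPY_CONFIG_TO_DEVICE(value)                         \
  {                                                          \
    int host_value = (value);                                \
    cudaMemcpyToSymbol(g_config, &host_value, sizeof(int));  \
  }

__global__ void use_config(int* out) { *out = g_config; }

int main() {
  int* out;
  cudaMallocManaged(&out, sizeof(int));
  COPY_CONFIG_TO_DEVICE(42);  // expands here, targets this TU's symbol
  use_config<<<1, 1>>>(out);
  cudaDeviceSynchronize();
  std::printf("device saw %d\n", *out);
  cudaFree(out);
  return 0;
}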
#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {} #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else #else
inline static void ensure_cuda_lock_arrays_on_device() { #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
copy_cuda_lock_arrays_to_device(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
}
#endif #endif
#else #else
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {} #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else #else
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. // Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
inline static void ensure_cuda_lock_arrays_on_device() { #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
copy_cuda_lock_arrays_to_device(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
desul::ensure_cuda_lock_arrays_on_device(); DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
}
#endif #endif
#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
} // namespace Impl
} // namespace Kokkos
#endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */

View File

@ -67,6 +67,34 @@
namespace Kokkos { namespace Kokkos {
namespace Impl { namespace Impl {
template <typename ParallelType, typename Policy, typename LaunchBounds>
int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelType,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const optimal_block_size =
Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
// Compute how many blocks of this size we can launch, based on warp
// constraints
int const max_warps_per_sm_registers =
Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
int const max_num_threads_from_warps =
max_warps_per_sm_registers * prop.warpSize;
int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
// Compute the total number of threads
int const max_threads_per_sm = optimal_block_size * max_num_blocks;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
}
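A standalone sketch, with assumed register and block-size numbers, of the tile-size bound computed by max_tile_size_product_helper above.

#include <algorithm>
#include <cstdio>

int main() {
  // Assumed values for illustration only.
  const int warp_size                    = 32;
  const int max_warps_per_sm_registers   = 48;    // from register limits
  const int optimal_block_size           = 256;   // from block-size deduction
  const int max_hierarchical_parallelism = 1024;  // stand-in for the CudaTraits cap

  const int max_num_threads_from_warps = max_warps_per_sm_registers * warp_size;  // 1536
  const int max_num_blocks     = max_num_threads_from_warps / optimal_block_size; // 6
  const int max_threads_per_sm = optimal_block_size * max_num_blocks;             // 1536

  std::printf("max tile size product: %d\n",
              std::min(max_threads_per_sm, max_hierarchical_parallelism));        // 1024
}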
template <class FunctorType, class... Traits> template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> { class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public: public:
@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public: public:
template <typename Policy, typename Functor> template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) { static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr = return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
CudaParallelLaunch<ParallelFor,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
} }
Policy const& get_policy() const { return m_rp; } Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const { inline __device__ void operator()() const {
@ -121,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]), maxblocks[1]),
1); 1);
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else if (RP::rank == 3) { } else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0); KOKKOS_ASSERT(block.x > 0);
@ -139,8 +155,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2])); maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else if (RP::rank == 4) { } else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z // threadIdx.z
@ -158,8 +173,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2])); maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else if (RP::rank == 5) { } else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z // threadIdx.z
@ -175,8 +189,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2])); maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else if (RP::rank == 6) { } else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z // threadIdx.z
@ -191,8 +204,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2])); maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(), *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
false);
} else { } else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
} }
@ -263,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public: public:
template <typename Policy, typename Functor> template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) { static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr = return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
CudaParallelLaunch<ParallelReduce,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
} }
Policy const& get_policy() const { return m_policy; } Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const { inline __device__ void exec_range(reference_type update) const {
@ -405,8 +407,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>( CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) { if (!m_result_ptr_device_accessible) {
if (m_result_ptr) { if (m_result_ptr) {

View File

@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif #endif
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_policy.space().impl_internal_space_instance(), *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
false);
} }
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>( CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) { if (!m_result_ptr_device_accessible) {
if (m_result_ptr) { if (m_result_ptr) {
@ -465,8 +464,24 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
public: public:
using pointer_type = typename Analysis::pointer_type; using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type; using reference_type = typename Analysis::reference_type;
using value_type = typename Analysis::value_type;
using functor_type = FunctorType; using functor_type = FunctorType;
using size_type = Cuda::size_type; using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
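  // Illustration (assumed, with Cuda::size_type taken as a 4-byte unsigned):
  //   value_type of 2 bytes -> word_size_type = int16_t
  //   value_type of 1 byte  -> word_size_type = int8_t
  //   value_type = double   -> word_size_type = size_type, so the scan moves
  //                            sizeof(double) / sizeof(size_type) = 2 words per value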
private: private:
// Algorithmic constraints: // Algorithmic constraints:
@ -477,7 +492,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
const FunctorType m_functor; const FunctorType m_functor;
const Policy m_policy; const Policy m_policy;
size_type* m_scratch_space; word_size_type* m_scratch_space;
size_type* m_scratch_flags; size_type* m_scratch_flags;
size_type m_final; size_type m_final;
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@ -501,12 +516,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void initial() const { __device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
size_type* const shared_value = word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() + kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y; word_count.value * threadIdx.y;
final_reducer.init(reinterpret_cast<pointer_type>(shared_value)); final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@ -532,7 +547,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// gridDim.x // gridDim.x
cuda_single_inter_block_reduce_scan<true>( cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x, final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags); m_scratch_flags);
} }
@ -541,21 +556,22 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void final() const { __device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... } // value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); word_size_type* const shared_data =
size_type* const shared_prefix = kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y; shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum = word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1); shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total. // Starting value for this thread block is the previous block's total.
if (blockIdx.x) { if (blockIdx.x) {
size_type* const block_total = word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1); m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -602,7 +618,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
typename Analysis::pointer_type(shared_data + word_count.value)); typename Analysis::pointer_type(shared_data + word_count.value));
{ {
size_type* const block_total = word_size_type* const block_total =
shared_data + word_count.value * blockDim.y; shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -690,8 +706,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// How many blocks are really needed for this much work: // How many blocks are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block; const int grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = cuda_internal_scratch_space( m_scratch_space =
m_policy.space(), Analysis::value_size(m_functor) * grid_x); reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags = m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
@ -708,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false; m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>( CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
} }
#endif #endif
m_final = true; m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>( CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
} }
} }
@ -752,10 +769,26 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
Policy, FunctorType>; Policy, FunctorType>;
public: public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type; using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type; using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType; using functor_type = FunctorType;
using size_type = Cuda::size_type; using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
private: private:
// Algorithmic constraints: // Algorithmic constraints:
@ -766,7 +799,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
const FunctorType m_functor; const FunctorType m_functor;
const Policy m_policy; const Policy m_policy;
size_type* m_scratch_space; word_size_type* m_scratch_space;
size_type* m_scratch_flags; size_type* m_scratch_flags;
size_type m_final; size_type m_final;
ReturnType& m_returnvalue; ReturnType& m_returnvalue;
@ -791,12 +824,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void initial() const { __device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
size_type* const shared_value = word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() + kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y; word_count.value * threadIdx.y;
final_reducer.init(reinterpret_cast<pointer_type>(shared_value)); final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@ -822,7 +855,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// gridDim.x // gridDim.x
cuda_single_inter_block_reduce_scan<true>( cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x, final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags); m_scratch_flags);
} }
@ -831,21 +864,22 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void final() const { __device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... } // value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); word_size_type* const shared_data =
size_type* const shared_prefix = kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y; shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum = word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1); shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total. // Starting value for this thread block is the previous block's total.
if (blockIdx.x) { if (blockIdx.x) {
size_type* const block_total = word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1); m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -894,7 +928,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
typename Analysis::pointer_type(shared_data + word_count.value)); typename Analysis::pointer_type(shared_data + word_count.value));
{ {
size_type* const block_total = word_size_type* const block_total =
shared_data + word_count.value * blockDim.y; shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -983,8 +1017,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// How many blocks are really needed for this much work: // How many blocks are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block; const int grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = cuda_internal_scratch_space( m_scratch_space =
m_policy.space(), Analysis::value_size(m_functor) * grid_x); reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags = m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
@ -1002,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false; m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>( CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
} }
#endif #endif
m_final = true; m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>( CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem, *this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(), m_policy.space()
false); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor); const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@ -1022,7 +1057,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
#endif #endif
DeepCopy<HostSpace, CudaSpace, Cuda>( DeepCopy<HostSpace, CudaSpace, Cuda>(
m_policy.space(), &m_returnvalue, m_policy.space(), &m_returnvalue,
m_scratch_space + (grid_x - 1) * size / sizeof(int), size); m_scratch_space + (grid_x - 1) * size / sizeof(word_size_type),
size);
} }
} }

View File

@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>( CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total, *this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(), m_policy.space()
true); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
} }
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>( CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total, *this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(), m_policy.space()
true); // copy to device and execute .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) { if (!m_result_ptr_device_accessible) {
m_policy.space().fence( m_policy.space().fence(

View File

@ -116,6 +116,7 @@ __device__ inline void cuda_inter_warp_reduction(
value = result[0]; value = result[0];
for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++) for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++)
reducer.join(&value, &result[i]); reducer.join(&value, &result[i]);
__syncthreads();
} }
template <class ValueType, class ReducerType> template <class ValueType, class ReducerType>
@ -427,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) // __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance. // function qualifier which could be used to improve performance.
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/* /*
* Algorithmic constraints: * Algorithmic constraints:
* (a) blockDim.y <= 1024 * (a) blockDim.y <= 1024

View File

@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0; const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>( Kokkos::Impl::CudaParallelLaunch<Self>(
*this, grid, block, shared, Cuda().impl_internal_space_instance(), *this, grid, block, shared, Cuda().impl_internal_space_instance());
false);
} }
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

View File

@ -448,11 +448,27 @@ class ParallelScanHIPBase {
Policy, FunctorType>; Policy, FunctorType>;
public: public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type; using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type; using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType; using functor_type = FunctorType;
using size_type = Kokkos::Experimental::HIP::size_type; using size_type = Kokkos::Experimental::HIP::size_type;
using index_type = typename Policy::index_type; using index_type = typename Policy::index_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::HIP::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
protected: protected:
// Algorithmic constraints: // Algorithmic constraints:
@ -463,7 +479,7 @@ class ParallelScanHIPBase {
const FunctorType m_functor; const FunctorType m_functor;
const Policy m_policy; const Policy m_policy;
size_type* m_scratch_space = nullptr; word_size_type* m_scratch_space = nullptr;
size_type* m_scratch_flags = nullptr; size_type* m_scratch_flags = nullptr;
size_type m_final = false; size_type m_final = false;
int m_grid_x = 0; int m_grid_x = 0;
@ -489,12 +505,12 @@ class ParallelScanHIPBase {
__device__ inline void initial() const { __device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
pointer_type const shared_value = reinterpret_cast<pointer_type>( pointer_type const shared_value = reinterpret_cast<pointer_type>(
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() + Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>() +
word_count.value * threadIdx.y); word_count.value * threadIdx.y);
final_reducer.init(shared_value); final_reducer.init(shared_value);
@ -518,7 +534,7 @@ class ParallelScanHIPBase {
// gridDim.x // gridDim.x
hip_single_inter_block_reduce_scan<true>( hip_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x, final_reducer, blockIdx.x, gridDim.x,
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(), Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>(),
m_scratch_space, m_scratch_flags); m_scratch_space, m_scratch_flags);
} }
@ -527,22 +543,22 @@ class ParallelScanHIPBase {
__device__ inline void final() const { __device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor); typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize / const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(size_type)> sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type)); word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... } // value[2] , ... }
size_type* const shared_data = word_size_type* const shared_data =
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(); Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>();
size_type* const shared_prefix = word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y; shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum = word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1); shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total. // Starting value for this thread block is the previous block's total.
if (blockIdx.x) { if (blockIdx.x) {
size_type* const block_total = word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1); m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -588,7 +604,7 @@ class ParallelScanHIPBase {
typename Analysis::pointer_type(shared_data + word_count.value)); typename Analysis::pointer_type(shared_data + word_count.value));
{ {
size_type* const block_total = word_size_type* const block_total =
shared_data + word_count.value * blockDim.y; shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) { for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i]; shared_accum[i] = block_total[i];
@ -647,8 +663,9 @@ class ParallelScanHIPBase {
// How many blocks are really needed for this much work: // How many blocks are really needed for this much work:
m_grid_x = (nwork + work_per_block - 1) / work_per_block; m_grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space( m_scratch_space = reinterpret_cast<word_size_type*>(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x); Kokkos::Experimental::Impl::hip_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x));
m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags( m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
m_policy.space(), sizeof(size_type) * 1); m_policy.space(), sizeof(size_type) * 1);
@ -734,7 +751,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
Kokkos::Experimental::HIP>( Kokkos::Experimental::HIP>(
Base::m_policy.space(), &m_returnvalue, Base::m_policy.space(), &m_returnvalue,
Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int), Base::m_scratch_space + (Base::m_grid_x - 1) * size /
sizeof(typename Base::word_size_type),
size); size);
} }
} }
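The switch from size_type to word_size_type lets the scan stage values whose size is not a multiple of sizeof(size_type), which is what the new small_size_scan test further down exercises with int8_t and int16_t. A minimal sketch of the idea, with an illustrative alias that is not part of Kokkos:

#include <cstdint>
#include <type_traits>

// Illustrative only: pick a staging "word" type that evenly divides the
// value size, so word_count = value_size / sizeof(word) never truncates.
template <class ValueType>
using scan_word_t =
    std::conditional_t<sizeof(ValueType) % sizeof(unsigned) == 0,
                       unsigned, unsigned char>;

static_assert(sizeof(std::int64_t) % sizeof(scan_word_t<std::int64_t>) == 0, "");
static_assert(sizeof(std::int8_t) % sizeof(scan_word_t<std::int8_t>) == 0, "");

int main() { return 0; }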

View File

@ -225,11 +225,11 @@ struct HIPReductionsFunctor<FunctorType, false> {
} }
} }
template <typename SizeType>
__device__ static inline bool scalar_inter_block_reduction( __device__ static inline bool scalar_inter_block_reduction(
FunctorType const& functor, FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_count, ::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data, SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) { ::Kokkos::Experimental::HIP::size_type* const global_flags) {
Scalar* const global_team_buffer_element = Scalar* const global_team_buffer_element =
reinterpret_cast<Scalar*>(global_data); reinterpret_cast<Scalar*>(global_data);
@ -411,16 +411,14 @@ __device__ void hip_intra_block_reduce_scan(
* Global reduce result is in the last threads' 'shared_data' location. * Global reduce result is in the last threads' 'shared_data' location.
*/ */
template <bool DoScan, class FunctorType> template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan_impl( __device__ bool hip_single_inter_block_reduce_scan_impl(
FunctorType const& functor, FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id, ::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count, ::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data, SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) { ::Kokkos::Experimental::HIP::size_type* const global_flags) {
using size_type = ::Kokkos::Experimental::HIP::size_type; using size_type = SizeType;
using value_type = typename FunctorType::value_type; using value_type = typename FunctorType::value_type;
using pointer_type = typename FunctorType::pointer_type; using pointer_type = typename FunctorType::pointer_type;
@ -518,13 +516,12 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
return is_last_block; return is_last_block;
} }
template <bool DoScan, typename FunctorType> template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan( __device__ bool hip_single_inter_block_reduce_scan(
FunctorType const& functor, FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id, ::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count, ::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data, SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) { ::Kokkos::Experimental::HIP::size_type* const global_flags) {
// If we are doing a reduction and we don't do an array reduction, we use the // If we are doing a reduction and we don't do an array reduction, we use the
// reduction-only path. Otherwise, we use the common path between reduction // reduction-only path. Otherwise, we use the common path between reduction

View File

@ -116,6 +116,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction(
value = result[0]; value = result[0];
for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i) for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
reducer.join(&value, &result[i]); reducer.join(&value, &result[i]);
__syncthreads();
} }
template <typename ValueType, typename ReducerType> template <typename ValueType, typename ReducerType>

View File

@ -3711,6 +3711,7 @@ namespace Impl {
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same< (std::is_same<
typename Kokkos::View<T, P...>::memory_space, typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
@ -3725,9 +3726,10 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t< inline std::enable_if_t<
!(std::is_same< !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
typename Kokkos::View<T, P...>::memory_space, !(std::is_same<typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && typename Kokkos::View<
T, P...>::HostMirror::memory_space>::value &&
std::is_same< std::is_same<
typename Kokkos::View<T, P...>::data_type, typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value), typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
@ -3738,25 +3740,33 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
} }
// Create a mirror view in a new space (specialization for same space) // Create a mirror view in a new space (specialization for same space)
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace, class = std::enable_if_t<
typename Impl::MirrorViewType<Space, T, P...>::view_type> Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src, std::enable_if_t<Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) { const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src; return src;
} }
// Create a mirror view in a new space (specialization for different space) // Create a mirror view in a new space (specialization for different space)
template <class Space, class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs,
std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace, class = std::enable_if_t<
typename Impl::MirrorViewType<Space, T, P...>::view_type> Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src, std::enable_if_t<!Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space; return Kokkos::Impl::create_mirror(src, arg_prop);
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);
return Kokkos::Impl::create_mirror(src, prop_copy);
} }
} // namespace Impl } // namespace Impl
@ -3815,9 +3825,10 @@ typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
template <class Space, class T, class... P, template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view( typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, Space const& space, Kokkos::Impl::WithoutInitializing_t wi, Space const&,
Kokkos::View<T, P...> const& v) { Kokkos::View<T, P...> const& v) {
return Impl::create_mirror_view(space, v, view_alloc(wi)); return Impl::create_mirror_view(
v, view_alloc(typename Space::memory_space{}, wi));
} }
template <class T, class... P, class... ViewCtorArgs> template <class T, class... P, class... ViewCtorArgs>
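The refactoring above routes the space-first overloads through the ViewCtorProp-based ones, so a memory space passed via view_alloc() and a Space object passed directly end up on the same code path. A small usage sketch, assuming any enabled backend (illustrative only):

#include <Kokkos_Core.hpp>

// Both calls below should yield a host-accessible mirror; the second form is
// the one the new ViewCtorProp overloads dispatch on.
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> v("v", 100);
    auto m1 = Kokkos::create_mirror_view(Kokkos::HostSpace{}, v);
    auto m2 = Kokkos::create_mirror_view(
        Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), v);
    (void)m1;
    (void)m2;
  }
  Kokkos::finalize();
}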

View File

@ -1754,7 +1754,10 @@ struct RankDataType<ValueType, 0> {
}; };
template <unsigned N, typename... Args> template <unsigned N, typename... Args>
KOKKOS_FUNCTION std::enable_if_t<N == View<Args...>::Rank, View<Args...>> KOKKOS_FUNCTION std::enable_if_t<
N == View<Args...>::Rank &&
std::is_same<typename ViewTraits<Args...>::specialize, void>::value,
View<Args...>>
as_view_of_rank_n(View<Args...> v) { as_view_of_rank_n(View<Args...> v) {
return v; return v;
} }
@ -1762,13 +1765,13 @@ as_view_of_rank_n(View<Args...> v) {
// Placeholder implementation to compile generic code for DynRankView; should // Placeholder implementation to compile generic code for DynRankView; should
// never be called // never be called
template <unsigned N, typename T, typename... Args> template <unsigned N, typename T, typename... Args>
std::enable_if_t< KOKKOS_FUNCTION std::enable_if_t<
N != View<T, Args...>::Rank, N != View<T, Args...>::Rank &&
std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value,
View<typename RankDataType<typename View<T, Args...>::value_type, N>::type, View<typename RankDataType<typename View<T, Args...>::value_type, N>::type,
Args...>> Args...>>
as_view_of_rank_n(View<T, Args...>) { as_view_of_rank_n(View<T, Args...>) {
Kokkos::Impl::throw_runtime_exception( Kokkos::abort("Trying to get at a View of the wrong rank");
"Trying to get at a View of the wrong rank");
return {}; return {};
} }
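Replacing the thrown exception with Kokkos::abort also makes the error path usable from device code, where exceptions are unavailable; note the overload gains KOKKOS_FUNCTION in the same hunk. A minimal illustration of that constraint (not the Kokkos implementation):

#include <Kokkos_Core.hpp>

// Hypothetical helper: a KOKKOS_FUNCTION may execute on the device, so its
// only portable failure mechanism is Kokkos::abort().
KOKKOS_FUNCTION int checked_rank(int requested, int actual) {
  if (requested != actual)
    Kokkos::abort("Trying to get at a View of the wrong rank");
  return actual;
}

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  int const r = checked_rank(2, 2);
  Kokkos::finalize();
  return r == 2 ? 0 : 1;
}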

View File

@ -101,8 +101,8 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
void push_work(const std::int32_t w) const noexcept { void push_work(const std::int32_t w) const noexcept {
const std::int32_t N = m_graph.numRows(); const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const ready_queue = &m_queue[0]; std::int32_t* const ready_queue = &m_queue[0];
std::int32_t volatile* const end_hint = &m_queue[2 * N + 1]; std::int32_t* const end_hint = &m_queue[2 * N + 1];
// Push work to end of queue // Push work to end of queue
const std::int32_t j = atomic_fetch_add(end_hint, 1); const std::int32_t j = atomic_fetch_add(end_hint, 1);
@ -134,14 +134,14 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
std::int32_t pop_work() const noexcept { std::int32_t pop_work() const noexcept {
const std::int32_t N = m_graph.numRows(); const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const ready_queue = &m_queue[0]; std::int32_t* const ready_queue = &m_queue[0];
std::int32_t volatile* const begin_hint = &m_queue[2 * N]; std::int32_t* const begin_hint = &m_queue[2 * N];
// begin hint is guaranteed to be less than or equal to // begin hint is guaranteed to be less than or equal to
// actual begin location in the queue. // actual begin location in the queue.
for (std::int32_t i = *begin_hint; i < N; ++i) { for (std::int32_t i = Kokkos::atomic_load(begin_hint); i < N; ++i) {
const std::int32_t w = ready_queue[i]; const std::int32_t w = Kokkos::atomic_load(&ready_queue[i]);
if (w == END_TOKEN) { if (w == END_TOKEN) {
return END_TOKEN; return END_TOKEN;
@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
const std::int32_t N = m_graph.numRows(); const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const count_queue = &m_queue[N]; std::int32_t* const count_queue = &m_queue[N];
const std::int32_t B = m_graph.row_map(w); const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w + 1); const std::int32_t E = m_graph.row_map(w + 1);
@ -199,7 +199,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator()(const TagCount, int i) const noexcept { void operator()(const TagCount, int i) const noexcept {
std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()]; std::int32_t* const count_queue = &m_queue[m_graph.numRows()];
atomic_increment(count_queue + m_graph.entries[i]); atomic_increment(count_queue + m_graph.entries[i]);
} }
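The queue accesses above drop volatile in favour of explicit atomic loads, which is the portable way to observe updates made concurrently by other threads. A stripped-down sketch of the polling pattern, using std::atomic in place of the Kokkos queue (illustrative only):

#include <atomic>
#include <cstdint>

// Hypothetical stand-in for the ready queue: poll entries with atomic loads
// until a pushed work item (a non-negative index) becomes visible.
std::int32_t pop_first_ready(const std::atomic<std::int32_t>* queue,
                             std::int32_t n) {
  for (std::int32_t i = 0; i < n; ++i) {
    const std::int32_t w = queue[i].load(std::memory_order_relaxed);
    if (w >= 0) return w;
  }
  return -1;  // nothing ready yet; the caller retries
}

int main() {
  std::atomic<std::int32_t> queue[4];
  for (auto& q : queue) q.store(-1);
  queue[2].store(7);
  return pop_first_ready(queue, 4) == 7 ? 0 : 1;
}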

View File

@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits { struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_AMPERE) defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia; static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false; static constexpr bool may_fallback_to_host = false;
#else #else

View File

@ -47,6 +47,7 @@
#endif #endif
#include <Kokkos_Macros.hpp> #include <Kokkos_Macros.hpp>
#include <impl/Kokkos_DeviceManagement.hpp>
#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)
@ -115,7 +116,8 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards. // from Pascal and upwards.
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ #if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300) #if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512); omp_set_num_teams(512);
#endif #endif
@ -164,7 +166,11 @@ void OpenMPTarget::impl_static_fence(const std::string& name) {
name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes); name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
} }
void OpenMPTarget::impl_initialize(InitializationSettings const&) { void OpenMPTarget::impl_initialize(InitializationSettings const& settings) {
using Kokkos::Impl::get_gpu;
const int device_num = get_gpu(settings);
omp_set_default_device(device_num);
Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize();
} }
void OpenMPTarget::impl_finalize() { void OpenMPTarget::impl_finalize() {
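With this change the OpenMP target backend honours the requested device id (for example one derived from the MPI local rank, see the get_gpu changes further down) by making it the default device before any offload region runs. A bare-bones sketch of that call sequence (illustrative, not the Kokkos code):

#include <omp.h>

int main() {
  const int num_devices = omp_get_num_devices();
  // A real run would pass the id chosen from the launcher environment;
  // 0 stands in for it here.
  if (num_devices > 0) omp_set_default_device(0);
  return 0;
}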

View File

@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \ #if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \ !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \ !defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
!defined(KOKKOS_ARCH_AMPERE) !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) { if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device()); Impl::SYCLInternal::singleton().initialize(sycl::device());
return; return;

View File

@ -337,7 +337,8 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ #if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256, 256,
#endif #endif
max_threads_for_memory max_threads_for_memory
@ -369,7 +370,8 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs. // FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \ #if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256, 256,
#endif #endif
max_threads_for_memory max_threads_for_memory

View File

@ -110,10 +110,9 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept {
return ((uint64_t)a) | (((uint64_t)d) << 32); return ((uint64_t)a) | (((uint64_t)d) << 32);
#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \ #elif defined(__powerpc64__) || defined(__ppc64__)
defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__)
unsigned int cycles = 0; unsigned long cycles = 0;
asm volatile("mftb %0" : "=r"(cycles)); asm volatile("mftb %0" : "=r"(cycles));

View File

@ -166,6 +166,8 @@ int get_device_count() {
#elif defined(KOKKOS_ENABLE_OPENACC) #elif defined(KOKKOS_ENABLE_OPENACC)
return acc_get_num_devices( return acc_get_num_devices(
Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); Kokkos::Experimental::Impl::OpenACC_Traits::dev_type);
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
return omp_get_num_devices();
#else #else
Kokkos::abort("implementation bug"); Kokkos::abort("implementation bug");
return -1; return -1;
@ -426,11 +428,17 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) {
Kokkos::abort("implementation bug"); Kokkos::abort("implementation bug");
} }
auto const* local_rank_str = char const* local_rank_str = nullptr;
std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); // OpenMPI for (char const* env_var : {
if (!local_rank_str) "OMPI_COMM_WORLD_LOCAL_RANK", // OpenMPI
local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); // MVAPICH2 "MV2_COMM_WORLD_LOCAL_RANK", // MVAPICH2
if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); // SLURM "MPI_LOCALRANKID", // MPICH
"SLURM_LOCALID", // SLURM
"PMI_LOCAL_RANK" // PMI
}) {
local_rank_str = std::getenv(env_var);
if (local_rank_str) break;
}
// use first GPU available for execution if unable to detect local MPI rank // use first GPU available for execution if unable to detect local MPI rank
if (!local_rank_str) { if (!local_rank_str) {

View File

@ -1128,9 +1128,8 @@ struct ViewOffset<
KOKKOS_INLINE_FUNCTION constexpr ViewOffset( KOKKOS_INLINE_FUNCTION constexpr ViewOffset(
const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs) const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || static_assert(((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 && (DimRHS::rank == 1 && dimension_type::rank == 1)),
dimension_type::rank_dynamic == 1),
"ViewOffset LayoutLeft and LayoutRight are only compatible " "ViewOffset LayoutLeft and LayoutRight are only compatible "
"when rank <= 1"); "when rank <= 1");
} }
@ -1778,8 +1777,7 @@ struct ViewOffset<
const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs) const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 && (DimRHS::rank == 1 && dimension_type::rank == 1),
dimension_type::rank_dynamic == 1),
"ViewOffset LayoutRight and LayoutLeft are only compatible " "ViewOffset LayoutRight and LayoutLeft are only compatible "
"when rank <= 1"); "when rank <= 1");
} }
@ -3059,10 +3057,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
std::is_trivially_copy_assignable<Dummy>::value> std::is_trivially_copy_assignable<Dummy>::value>
construct_shared_allocation() { construct_shared_allocation() {
// Shortcut for zero initialization // Shortcut for zero initialization
ValueType value{};
// On A64FX memset seems to do the wrong thing with regards to first touch // On A64FX memset seems to do the wrong thing with regards to first touch
// leading to the significant performance issues // leading to the significant performance issues
#ifndef KOKKOS_ARCH_A64FX #ifndef KOKKOS_ARCH_A64FX
ValueType value{};
if (Impl::is_zero_byte(value)) { if (Impl::is_zero_byte(value)) {
uint64_t kpID = 0; uint64_t kpID = 0;
if (Kokkos::Profiling::profileLibraryLoaded()) { if (Kokkos::Profiling::profileLibraryLoaded()) {
@ -3539,9 +3537,7 @@ class ViewMapping<
typename SrcTraits::array_layout>::value || typename SrcTraits::array_layout>::value ||
std::is_same<typename DstTraits::array_layout, std::is_same<typename DstTraits::array_layout,
Kokkos::LayoutStride>::value || Kokkos::LayoutStride>::value ||
(DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1)
(DstTraits::dimension::rank == 1 &&
DstTraits::dimension::rank_dynamic == 1)
}; };
public: public:

View File

@ -73,6 +73,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
SET(COMPILE_ONLY_SOURCES SET(COMPILE_ONLY_SOURCES
TestArray.cpp TestArray.cpp
TestCreateMirror.cpp
TestDetectionIdiom.cpp TestDetectionIdiom.cpp
TestInterOp.cpp TestInterOp.cpp
TestLegionInteroperability.cpp TestLegionInteroperability.cpp
@ -86,6 +87,7 @@ ENDIF()
KOKKOS_ADD_EXECUTABLE( KOKKOS_ADD_EXECUTABLE(
TestCompileOnly TestCompileOnly
SOURCES SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES} ${COMPILE_ONLY_SOURCES}
) )

View File

@ -0,0 +1 @@
int main() {}

View File

@ -0,0 +1,126 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
template <typename TestView, typename MemorySpace>
void check_memory_space(TestView, MemorySpace) {
static_assert(
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
}
template <class View>
auto host_mirror_test_space(View) {
return std::conditional_t<
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
typename View::memory_space>::accessible,
typename View::memory_space, Kokkos::HostSpace>{};
}
template <typename View>
void test_create_mirror_properties(const View& view) {
using namespace Kokkos;
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
// clang-format off
// create_mirror
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror( view), host_mirror_test_space(view));
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror_view
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror view_alloc
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror view_alloc + execution space
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc + execution space
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc + execution space
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
// clang-format on
}
void test() {
Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view("device view",
10);
Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}

View File

@ -92,5 +92,3 @@ static_assert(std::is_same<difference_type<Woof>, int>::value,
static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value, static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
"Bark's difference_type should be ptrdiff_t!"); "Bark's difference_type should be ptrdiff_t!");
} // namespace Example } // namespace Example
int main() {}

View File

@ -45,12 +45,12 @@
#include <Kokkos_Core.hpp> #include <Kokkos_Core.hpp>
#include <cstdio> #include <cstdio>
namespace Test { namespace {
template <class Device> template <class Device, class T, T ImbalanceSz>
struct TestScan { struct TestScan {
using execution_space = Device; using execution_space = Device;
using value_type = int64_t; using value_type = T;
Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors; Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
@ -58,7 +58,10 @@ struct TestScan {
void operator()(const int iwork, value_type& update, void operator()(const int iwork, value_type& update,
const bool final_pass) const { const bool final_pass) const {
const value_type n = iwork + 1; const value_type n = iwork + 1;
const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0; const value_type imbalance =
((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz))
? ImbalanceSz
: value_type(0);
// Insert an artificial load imbalance // Insert an artificial load imbalance
@ -133,12 +136,29 @@ struct TestScan {
} }
} }
}; };
} // namespace
TEST(TEST_CATEGORY, scan) { TEST(TEST_CATEGORY, scan) {
TestScan<TEST_EXECSPACE>::test_range(1, 1000); constexpr auto imbalance_size = 1000;
TestScan<TEST_EXECSPACE>(0); TestScan<TEST_EXECSPACE, int64_t, imbalance_size>::test_range(1, 1000);
TestScan<TEST_EXECSPACE>(100000); TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(0);
TestScan<TEST_EXECSPACE>(10000000); TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(100000);
TEST_EXECSPACE().fence(); TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(10000000);
}
TEST(TEST_CATEGORY, small_size_scan) {
constexpr auto imbalance_size = 10; // Pick to not overflow...
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(0);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(5);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(10);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(
static_cast<std::size_t>(
std::sqrt(std::numeric_limits<std::int8_t>::max())));
constexpr auto short_imbalance_size = 100; // Pick to not overflow...
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(0);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(5);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(100);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(
static_cast<std::size_t>(
std::sqrt(std::numeric_limits<std::int16_t>::max())));
} }
} // namespace Test

View File

@ -1616,6 +1616,73 @@ struct TestTeamPolicyHandleByValue {
} // namespace } // namespace
namespace {
template <typename ExecutionSpace>
struct TestRepeatedTeamReduce {
static constexpr int ncol = 1500; // nothing special, just some work
KOKKOS_FUNCTION void operator()(
const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
const {
// non-divisible by power of two to make triggering problems easier
constexpr int nlev = 129;
constexpr auto pi = Kokkos::Experimental::pi_v<double>;
double b = 0.;
for (int ri = 0; ri < 10; ++ri) {
// The contributions here must be sufficiently complex, simply adding ones
// wasn't enough to trigger the bug.
const auto g1 = [&](const int k, double &acc) {
acc += Kokkos::cos(pi * double(k) / nlev);
};
const auto g2 = [&](const int k, double &acc) {
acc += Kokkos::sin(pi * double(k) / nlev);
};
double a1, a2;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
b += a1;
b += a2;
}
const auto h = [&]() {
const auto col = team.league_rank();
v(col) = b + col;
};
Kokkos::single(Kokkos::PerTeam(team), h);
}
KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
if (v(i) != v(0) + i) {
++bad;
KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i);
}
}
TestRepeatedTeamReduce() : v("v", ncol) { test(); }
void test() {
int team_size_recommended =
Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended(
*this, Kokkos::ParallelForTag());
// Choose a non-recommended (non-power-of-two for GPUs) team size
int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1;
// The failure was non-deterministic so run the test a bunch of times
for (int it = 0; it < 100; ++it) {
Kokkos::parallel_for(
Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this);
int bad = 0;
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol),
*this, bad);
ASSERT_EQ(bad, 0) << " Failing in iteration " << it;
}
}
Kokkos::View<double *, ExecutionSpace> v;
};
} // namespace
} // namespace Test } // namespace Test
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/

View File

@ -134,5 +134,15 @@ TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) {
} }
} }
TEST(TEST_CATEGORY, repeated_team_reduce) {
#ifdef KOKKOS_ENABLE_OPENMPTARGET
if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not "
"properly implemented";
#endif
TestRepeatedTeamReduce<TEST_EXECSPACE>();
}
} // namespace Test } // namespace Test
#endif #endif

View File

@ -92,8 +92,18 @@ TEST(TEST_CATEGORY, view_is_assignable) {
View<double*, left, d_exec>>::test(false, false, 10); View<double*, left, d_exec>>::test(false, false, 10);
// Layout assignment // Layout assignment
Impl::TestAssignability<View<int, left, d_exec>,
View<int, right, d_exec>>::test(true, true);
Impl::TestAssignability<View<int*, left, d_exec>, Impl::TestAssignability<View<int*, left, d_exec>,
View<int*, right, d_exec>>::test(true, true, 10); View<int*, right, d_exec>>::test(true, true, 10);
Impl::TestAssignability<View<int[5], left, d_exec>,
View<int*, right, d_exec>>::test(false, false, 10);
Impl::TestAssignability<View<int[10], left, d_exec>,
View<int*, right, d_exec>>::test(false, true, 10);
Impl::TestAssignability<View<int*, left, d_exec>,
View<int[5], right, d_exec>>::test(true, true);
Impl::TestAssignability<View<int[5], left, d_exec>,
View<int[10], right, d_exec>>::test(false, false);
// This could be made possible (due to the degenerate nature of the views) but // This could be made possible (due to the degenerate nature of the views) but
// we do not allow this yet // we do not allow this yet

lib/kokkos/kokkos_5538.diff (new file, 199 lines)
View File

@ -0,0 +1,199 @@
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 22af411f32..530510a0d1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+ endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 88ddc48378..b83ced9243 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -102,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index f56cef1651..2585a6a64c 100644
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index ef16aad047..c1d76cceeb 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()
+IF (KOKKOS_ARCH_HOPPER90)
+ SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 56f9117844..fcd4773dbc 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
case 61: return 96;
case 70:
case 80:
- case 86: return 8;
+ case 86:
+ case 90: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index 40a263561f..8c40ebd60d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index f9451ecfe6..2ce1efb98c 100644
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_AMPERE)
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index a9bc085912..27ee1d4232 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 840db4327c..7e5addbc5b 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
- !defined(KOKKOS_ARCH_AMPERE)
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 5ac7d8af30..ba101f699e 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

lib/kokkos/kokkos_5706.diff (new file, 523 lines)
View File

@ -0,0 +1,523 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index fcd4773dbc..30b6958a67 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these number can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
return 0;
}() * 1024;
}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 5811498e01..e22eb3b842 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84f..5c4c3a7d39 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}
-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
+
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
- bool prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+ const KernelFuncPtr& func) {
+ // Only call cudaFuncGetAttributes once for each unique kernel
+ // by leveraging static variable initialization rules
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+ cudaFuncAttributes attr;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+ return attr;
+ };
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
+ return func_attr;
+}
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+ const cudaDeviceProp& device_props,
+ const size_t block_size, int& shmem,
+ const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
- // On Kepler the L1 has no benefit since it doesn't cache reads
+
+ const auto& func_attr =
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+ // Compute limits for number of blocks due to registers/SM
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
+ const size_t regs_per_thread = func_attr.numRegs;
+ // The granularity of register allocation is chunks of 256 registers per warp
+ // -> 8 registers per thread
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+ const size_t max_blocks_regs =
+ regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute how many threads per sm we actually want
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+ // only allocate multiples of warp size
+ const size_t num_threads_desired =
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+ // Get close to the desired occupancy,
+ // don't undershoot by much but also don't allocate a whole new block just
+ // because one is a few threads over otherwise.
+ size_t num_blocks_desired =
+ (num_threads_desired + block_size * 0.8) / block_size;
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+ // Calculate how much shared memory we need per block
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
+ // If we want to lower occupancy we have to make sure we request at least that
+ // much in aggregate over all blocks, so that shared memory actually becomes a
+ // limiting factor for occupancy
+ constexpr size_t min_shmem_size_per_sm = 8192;
+ if ((occupancy < 100) &&
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+ // Need to set the caller's shmem variable so that the
+ // kernel launch uses the correct dynamic shared memory request
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
+ }
+
+ // Compute the carveout fraction we need based on occupancy
+ // Use multiples of 8kB
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+ size_t carveout = shmem_per_block == 0
+ ? 0
+ : 100 *
+ (((num_blocks_desired * shmem_per_block +
+ min_shmem_size_per_sm - 1) /
+ min_shmem_size_per_sm) *
+ min_shmem_size_per_sm) /
+ max_shmem_per_sm;
+ if (carveout > 100) carveout = 100;
+
+ // Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
- func,
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
- return prefer_shmem;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+ return carveout;
};
- static bool cache_config_preference_cached = set_cache_config();
- if (cache_config_preference_cached != prefer_shmem) {
+ // Store the value in a static variable so we only reset if needed
+ static size_t cache_config_preference_cached = set_cache_config();
+ if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
- (void)prefer_shmem;
+ (void)device_props;
+ (void)block_size;
+ (void)occupancy;
#endif
}
-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const& policy, cudaDeviceProp const& properties,
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
- bool& prefer_shmem) {
- int const block_size = block.x * block.y * block.z;
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
- size_t const static_shmem = attributes.sharedSizeBytes;
-
- // round to nearest integer and avoid division by zero
- int active_blocks = std::max(
- 1, static_cast<int>(std::round(
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
- block_size * desired_occupancy / 100)));
- int const dynamic_shmem =
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
- if (dynamic_shmem > shmem) {
- shmem = dynamic_shmem;
- prefer_shmem = false;
- }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
void const* args[] = {&driver};
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if constexpr (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
+ driver, grid, block, shmem, cuda_instance);
}
#endif
};
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
- const CudaInternal* cuda_instance,
- bool prefer_shmem) {
+ const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem);
- // If a desired occupancy is specified, we compute how much shared memory
- // to ask for to achieve that occupancy, assuming that the cache
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
- // shared memory computed is actually smaller than `shmem` we overwrite
- // `shmem` and set `prefer_shmem` to `false`.
- modify_launch_configuration_if_desired_occupancy_is_specified(
- driver.get_policy(), cuda_instance->m_deviceProp,
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
- Impl::configure_shmem_preference<
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<
+ DriverType,
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
}
static cudaFuncAttributes get_cuda_func_attributes() {
- // Race condition inside of cudaFuncGetAttributes if the same address is
- // given requires using a local variable as input instead of a static Rely
- // on static variable initialization to make sure only one thread executes
- // the code and the result is visible.
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
- cudaFuncAttributes attr_tmp;
- KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
- return attr_tmp;
- };
- static cudaFuncAttributes attr = wrap_get_attributes();
- return attr;
+ return get_cuda_kernel_func_attributes<
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func());
}
};
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index e586bb4cc6..0e348c092a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index ac160f8fe2..d1031751c2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
}
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdd16085b3..ea9430b812 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
m_policy.space().fence(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 34d4bef9fd..178012431c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fb3a6b138f..a12378a891 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>(
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
- false);
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
}
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

@@ -0,0 +1,46 @@

diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 30b6958a67..b94f053272 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-// "Compute Capability" first and check what is the smallest "Shared Memory
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-// bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-// the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
- int const compute_capability = properties.major * 10 + properties.minor;
- return [compute_capability]() {
- switch (compute_capability) {
- case 30:
- case 32:
- case 35: return 16;
- case 37: return 80;
- case 50:
- case 53:
- case 60:
- case 62: return 64;
- case 52:
- case 61: return 96;
- case 70:
- case 80:
- case 86:
- case 90: return 8;
- case 75: return 32;
- default:
- Kokkos::Impl::throw_runtime_exception(
- "Unknown device in cuda block size deduction");
- }
- return 0;
- }() * 1024;
-}
-
} // namespace Impl
} // namespace Kokkos

lib/kokkos/kokkos_5739.diff (new file, 204 lines)
@@ -0,0 +1,204 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index b94f053272..252c13c524 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -53,17 +53,69 @@
namespace Kokkos {
namespace Impl {
+inline int cuda_warp_per_sm_allocation_granularity(
+ cudaDeviceProp const& properties) {
+ // Allocation granularity of warps in each sm
+ switch (properties.major) {
+ case 3:
+ case 5:
+ case 7:
+ case 8:
+ case 9: return 4;
+ case 6: return (properties.minor == 0 ? 2 : 4);
+ default:
+ throw_runtime_exception(
+ "Unknown device in cuda warp per sm allocation granularity");
+ return 0;
+ }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+ // Maximum number of warps per sm as a function of register counts,
+ // subject to the constraint that warps are allocated with a fixed granularity
+ int const max_regs_per_block = properties.regsPerBlock;
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
+ int const warp_granularity =
+ cuda_warp_per_sm_allocation_granularity(properties);
+ // The granularity of register allocation is chunks of 256 registers per warp,
+ // which implies a need to over-allocate, so we round up
+ int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);
+
+ // The maximum number of warps per SM is constrained from above by register
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
+ // finite granularity, we need to round down.
+ int const max_warps_per_sm =
+ warp_granularity *
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+ return max_warps_per_sm;
+}
+
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
- regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ int const max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(properties, attributes);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / properties.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+ LaunchBounds) {
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
+ // memory
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+ LaunchBounds{});
+}
+
} // namespace Impl
} // namespace Kokkos
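The helpers added above bound the number of resident warps and blocks per SM by register usage, honoring both the 256-register-per-warp allocation chunk and the per-architecture warp allocation granularity. A small sketch of the arithmetic with assumed register counts and device limits (not taken from any specific GPU):

    #include <cstdio>

    int main() {
      int const warp_size        = 32;
      int const regs_per_thread  = 40;     // attributes.numRegs (assumed)
      int const regs_per_block   = 65536;  // properties.regsPerBlock (assumed)
      int const regs_per_sm      = 65536;  // properties.regsPerMultiprocessor (assumed)
      int const warp_granularity = 4;      // cuda_warp_per_sm_allocation_granularity (assumed)
      int const block_size       = 96;     // 3 warps per block

      // Registers are handed to warps in 256-register chunks, so round up
      int const regs_per_warp           = regs_per_thread * warp_size;          // 1280
      int const allocated_regs_per_warp = 256 * ((regs_per_warp + 255) / 256);  // 1280

      // Warps per SM are granted in groups of warp_granularity, so round down
      int const max_warps_per_sm =
          warp_granularity *
          (regs_per_block / (allocated_regs_per_warp * warp_granularity));      // 48

      // Per-thread register rounding (8-register granularity) limits blocks/SM
      int const allocated_regs_per_thread = 8 * ((regs_per_thread + 7) / 8);    // 40
      int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);  // 17

      // Cap blocks so the implied warp count never exceeds the warp limit
      while ((max_blocks_regs * block_size / warp_size) > max_warps_per_sm)
        --max_blocks_regs;                                                      // 17 -> 16

      std::printf("max warps/SM = %d, max blocks/SM = %d\n",
                  max_warps_per_sm, max_blocks_regs);
    }

Without the new warp cap, 17 blocks of 96 threads would imply 51 resident warps, more than the 48 the register file can actually hold at this allocation granularity.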
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5c4c3a7d39..170183ca0a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ const size_t max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
+
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0e348c092a..7e4f62f12e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {
+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+ cudaFuncAttributes attr =
+ CudaParallelLaunch<ParallelType,
+ LaunchBounds>::get_cuda_func_attributes();
+ auto const& prop = pol.space().cuda_device_prop();
+
+ // Limits due to registers/SM, MDRange doesn't have
+ // shared memory constraints
+ int const optimal_block_size =
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+ // Compute how many blocks of this size we can launch, based on warp
+ // constraints
+ int const max_warps_per_sm_registers =
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+ int const max_num_threads_from_warps =
+ max_warps_per_sm_registers * prop.warpSize;
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+ // Compute the total number of threads
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+ return std::min(
+ max_threads_per_sm,
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelFor,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelReduce,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {

@@ -0,0 +1,63 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 170183ca0a..ba43e362bb 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
+ /*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
void const* args[] = {&driver};
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- if constexpr (DriverType::Policy::
+ if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
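For reference, a desired occupancy enters through an occupancy hint attached to an execution policy; with the patch above applied, submitting such a policy while a CUDA graph is being built fails loudly instead of silently misconfiguring shared memory. A hedged sketch using the standard Kokkos::Experimental hint (the problem size and the 50% figure are arbitrary):

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        int const n = 1 << 20;
        Kokkos::View<double*> x("x", n);

        // Ask Kokkos to target roughly 50% occupancy instead of its default
        // block-size heuristic; inside CUDA graph capture this is the code
        // path the patch turns into a runtime error until fixed upstream.
        auto policy = Kokkos::Experimental::prefer(
            Kokkos::RangePolicy<>(0, n),
            Kokkos::Experimental::DesiredOccupancy{50});

        Kokkos::parallel_for(
            "fill", policy, KOKKOS_LAMBDA(int i) { x(i) = 2.0 * i; });
        Kokkos::fence();
      }
      Kokkos::finalize();
      return 0;
    }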

@@ -29,3 +29,4 @@ tag: 3.5.00 date: 11:19:2021 master: c28a8b03 release: 21b879e4
tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff
tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404 tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404
tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb
tag: 3.7.01 date: 12:01:2022 master: 61d7db55 release: d3bb8cfe

@@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE TEST_OPTIONAL_TPLS CUSPARSE
) )

@@ -0,0 +1,3 @@
#Official Tool: clang-format version 8.0.0
DisableFormat: true
SortIncludes: false

@@ -76,7 +76,7 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global /// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels /// variable based on the Host global variable prior to running any kernels
/// that will use it. /// that will use it.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. /// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__ __device__
#ifdef __CUDACC_RDC__ #ifdef __CUDACC_RDC__
__constant__ extern __constant__ extern
@@ -138,42 +138,33 @@ namespace {
static int lock_array_copied = 0; static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; } inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace } // namespace
#ifdef __CUDACC_RDC__
inline
#else
static
#endif
void
copy_cuda_lock_arrays_to_device() {
if (lock_array_copied == 0) {
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
&CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
sizeof(int32_t*));
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
&CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
sizeof(int32_t*));
}
lock_array_copied = 1;
}
} // namespace Impl } // namespace Impl
} // namespace desul } // namespace desul
/* It is critical that this code be a macro, so that it will
capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE
putting this in an inline function will NOT do the right thing! */
#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
if (::desul::Impl::lock_array_copied == 0) { \
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE, \
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
sizeof(int32_t*)); \
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE, \
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \
sizeof(int32_t*)); \
} \
::desul::Impl::lock_array_copied = 1; \
}
#endif /* defined( __CUDACC__ ) */ #endif /* defined( __CUDACC__ ) */
#endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */ #endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
namespace desul {
#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__)) #if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
inline void ensure_cuda_lock_arrays_on_device() {} #define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else #else
static inline void ensure_cuda_lock_arrays_on_device() { #define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
Impl::copy_cuda_lock_arrays_to_device(); DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
}
#endif #endif
} // namespace desul #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */
#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */

@@ -70,7 +70,7 @@ void init_lock_arrays_cuda() {
"init_lock_arrays_cuda: cudaMalloc host locks"); "init_lock_arrays_cuda: cudaMalloc host locks");
auto error_sync1 = cudaDeviceSynchronize(); auto error_sync1 = cudaDeviceSynchronize();
copy_cuda_lock_arrays_to_device(); DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs"); check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
auto error_sync2 = cudaDeviceSynchronize(); auto error_sync2 = cudaDeviceSynchronize();
@@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() {
CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
#ifdef __CUDACC_RDC__ #ifdef __CUDACC_RDC__
copy_cuda_lock_arrays_to_device(); DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif #endif
} }

@@ -190,6 +190,20 @@ namespace LAMMPS_NS {
Tp_BIAS,Tp_RMASS,Tp_ZERO>(i); Tp_BIAS,Tp_RMASS,Tp_ZERO>(i);
} }
KOKKOS_INLINE_FUNCTION
static void init(value_type &update) {
update.fx = 0.0;
update.fy = 0.0;
update.fz = 0.0;
}
KOKKOS_INLINE_FUNCTION
static void join(value_type &update,
const value_type &source) {
update.fx += source.fx;
update.fy += source.fy;
update.fz += source.fz;
}
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
static void init(volatile value_type &update) { static void init(volatile value_type &update) {
update.fx = 0.0; update.fx = 0.0;
@@ -233,6 +247,15 @@
energy += c.compute_energy_item(i); energy += c.compute_energy_item(i);
} }
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
static void init(value_type &update) {
update = 0.0;
}
KOKKOS_INLINE_FUNCTION
static void join(value_type &update,
const value_type &source) {
update += source;
}
KOKKOS_INLINE_FUNCTION
static void init(volatile value_type &update) { static void init(volatile value_type &update) {
update = 0.0; update = 0.0;
} }
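The non-volatile init()/join() overloads added above track the Kokkos 3.7 move away from volatile reduction interfaces; keeping both sets lets the functors compile against Kokkos versions on either side of that transition. A minimal, self-contained sketch of the same pattern (hypothetical functor, not LAMMPS code):

    #include <Kokkos_Core.hpp>
    #include <cstdio>

    // Custom reduction value with several components, as in the force sums above
    struct ForceSum {
      double fx, fy, fz;
    };

    struct ForceFunctor {
      using value_type = ForceSum;

      KOKKOS_INLINE_FUNCTION
      void operator()(const int i, value_type& update) const {
        update.fx += 1.0 * i;
        update.fy += 2.0 * i;
        update.fz += 3.0 * i;
      }

      // New non-volatile overloads, required once Kokkos drops volatile
      KOKKOS_INLINE_FUNCTION
      void init(value_type& update) const {
        update.fx = 0.0; update.fy = 0.0; update.fz = 0.0;
      }
      KOKKOS_INLINE_FUNCTION
      void join(value_type& update, const value_type& src) const {
        update.fx += src.fx; update.fy += src.fy; update.fz += src.fz;
      }

      // Legacy volatile overloads, still used by older Kokkos versions
      KOKKOS_INLINE_FUNCTION
      void init(volatile value_type& update) const {
        update.fx = 0.0; update.fy = 0.0; update.fz = 0.0;
      }
      KOKKOS_INLINE_FUNCTION
      void join(volatile value_type& update, const volatile value_type& src) const {
        update.fx += src.fx; update.fy += src.fy; update.fz += src.fz;
      }
    };

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        ForceSum f;
        Kokkos::parallel_reduce("force_sum", 1000, ForceFunctor(), f);
        std::printf("fx=%g fy=%g fz=%g\n", f.fx, f.fy, f.fz);
      }
      Kokkos::finalize();
      return 0;
    }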

@@ -77,7 +77,6 @@ GPU_AWARE_UNKNOWN
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
Kokkos::InitArguments KokkosLMP::args{-1, -1, -1, false};
int KokkosLMP::is_finalized = 0; int KokkosLMP::is_finalized = 0;
int KokkosLMP::init_ngpus = 0; int KokkosLMP::init_ngpus = 0;
@@ -110,7 +109,6 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
ngpus = 0; ngpus = 0;
int device = 0; int device = 0;
nthreads = 1; nthreads = 1;
numa = 1;
int iarg = 0; int iarg = 0;
while (iarg < narg) { while (iarg < narg) {
@@ -189,30 +187,24 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
iarg += 2; iarg += 2;
} else if (strcmp(arg[iarg],"n") == 0 ||
strcmp(arg[iarg],"numa") == 0) {
numa = utils::inumeric(FLERR, arg[iarg+1], false, lmp);
iarg += 2;
} else error->all(FLERR,"Invalid Kokkos command-line arg: {}", arg[iarg]); } else error->all(FLERR,"Invalid Kokkos command-line arg: {}", arg[iarg]);
} }
// Initialize Kokkos. However, we cannot change any // Initialize Kokkos. However, we cannot change any
// Kokkos library parameters after the first initalization // Kokkos library parameters after the first initalization
if (args.num_threads != -1) { Kokkos::InitializationSettings args;
if ((args.num_threads != nthreads) || (args.num_numa != numa) || (args.device_id != device))
if (args.has_num_threads()) {
if ((args.get_num_threads() != nthreads) || (args.get_device_id() != device))
if (me == 0) if (me == 0)
error->warning(FLERR,"Kokkos package already initalized, " error->warning(FLERR,"Kokkos package already initalized. Cannot change parameters");
"cannot reinitialize with different parameters"); nthreads = args.get_num_threads();
nthreads = args.num_threads; device = args.get_device_id();
numa = args.num_numa;
device = args.device_id;
ngpus = init_ngpus; ngpus = init_ngpus;
} else { } else {
args.num_threads = nthreads; args.set_num_threads(nthreads);
args.num_numa = numa; args.set_device_id(device);
args.device_id = device;
init_ngpus = ngpus; init_ngpus = ngpus;
} }
@@ -350,7 +342,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
void KokkosLMP::initialize(Kokkos::InitArguments args, Error *error) void KokkosLMP::initialize(Kokkos::InitializationSettings args, Error *error)
{ {
if (!Kokkos::is_initialized()) { if (!Kokkos::is_initialized()) {
if (is_finalized) if (is_finalized)
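Kokkos 3.7 replaces the aggregate Kokkos::InitArguments with Kokkos::InitializationSettings, which uses getter/setter methods and, consistent with dropping the "numa" option, no longer carries a NUMA field. A minimal sketch of initializing Kokkos through the new interface (thread and device numbers are placeholders):

    #include <Kokkos_Core.hpp>

    int main() {
      Kokkos::InitializationSettings settings;
      settings.set_num_threads(4);  // host threads for this process
      settings.set_device_id(0);    // GPU assigned to this process

      // Settings can be queried before use, mirroring the has_/get_ calls above
      if (settings.has_num_threads() && settings.get_num_threads() < 1)
        return 1;

      Kokkos::initialize(settings);
      // ... build views and launch kernels ...
      Kokkos::finalize();
      return 0;
    }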

@@ -43,7 +43,6 @@ class KokkosLMP : protected Pointers {
int forward_fix_comm_changed; int forward_fix_comm_changed;
int reverse_comm_changed; int reverse_comm_changed;
int nthreads,ngpus; int nthreads,ngpus;
int numa;
int auto_sync; int auto_sync;
int gpu_aware_flag; int gpu_aware_flag;
int neigh_thread; int neigh_thread;
@@ -53,12 +52,11 @@
double binsize; double binsize;
static int is_finalized; static int is_finalized;
static Kokkos::InitArguments args;
static int init_ngpus; static int init_ngpus;
KokkosLMP(class LAMMPS *, int, char **); KokkosLMP(class LAMMPS *, int, char **);
static void initialize(Kokkos::InitArguments, Error *); static void initialize(Kokkos::InitializationSettings, Error *);
static void finalize(); static void finalize();
void accelerator(int, char **); void accelerator(int, char **);
int neigh_count(int); int neigh_count(int);

@@ -524,6 +524,12 @@ struct PairReaxKokkosFindBondFunctor {
PairReaxFFKokkos<DeviceType> c; PairReaxFFKokkos<DeviceType> c;
PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr):c(*c_ptr) {}; PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr):c(*c_ptr) {};
KOKKOS_INLINE_FUNCTION
void join(int &dst,
const int &src) const {
dst = MAX(dst,src);
}
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void join(volatile int &dst, void join(volatile int &dst,
const volatile int &src) const { const volatile int &src) const {

@@ -52,7 +52,6 @@ class KokkosLMP {
int kokkos_exists; int kokkos_exists;
int nthreads; int nthreads;
int ngpus; int ngpus;
int numa;
KokkosLMP(class LAMMPS *, int, char **) { kokkos_exists = 0; } KokkosLMP(class LAMMPS *, int, char **) { kokkos_exists = 0; }
~KokkosLMP() {} ~KokkosLMP() {}

@@ -91,7 +91,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
nthreads = 1; nthreads = 1;
#ifdef _OPENMP #ifdef _OPENMP
if (lmp->kokkos) { if (lmp->kokkos) {
nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa; nthreads = lmp->kokkos->nthreads;
} else if (getenv("OMP_NUM_THREADS") == nullptr) { } else if (getenv("OMP_NUM_THREADS") == nullptr) {
nthreads = 1; nthreads = 1;
if (me == 0) if (me == 0)