Merge pull request #3532 from stanmoore1/kk_occupancy

Update Kokkos version in LAMMPS to 3.7.1
Authored by Axel Kohlmeyer on 2023-01-20 17:52:05 -05:00; committed by GitHub.
67 changed files with 2295 additions and 561 deletions

View File

@ -49,8 +49,8 @@ if(DOWNLOAD_KOKKOS)
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
include(ExternalProject)
set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.00.tar.gz" CACHE STRING "URL for KOKKOS tarball")
set(KOKKOS_MD5 "84991eca9f066383abe119a5bc7a11c4" CACHE STRING "MD5 checksum of KOKKOS tarball")
set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.7.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
set(KOKKOS_MD5 "f140e02b826223b1045207d9bc10d404" CACHE STRING "MD5 checksum of KOKKOS tarball")
mark_as_advanced(KOKKOS_URL)
mark_as_advanced(KOKKOS_MD5)
ExternalProject_Add(kokkos_build
@ -74,7 +74,7 @@ if(DOWNLOAD_KOKKOS)
add_dependencies(LAMMPS::KOKKOSCORE kokkos_build)
add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build)
elseif(EXTERNAL_KOKKOS)
find_package(Kokkos 3.7.00 REQUIRED CONFIG)
find_package(Kokkos 3.7.01 REQUIRED CONFIG)
target_link_libraries(lammps PRIVATE Kokkos::kokkos)
target_link_libraries(lmp PRIVATE Kokkos::kokkos)
else()

View File

@ -105,13 +105,12 @@ Either the full word or an abbreviation can be used for the keywords.
Note that the keywords do not use a leading minus sign. I.e. the
keyword is "t", not "-t". Also note that each of the keywords has a
default setting. Examples of when to use these options and what
settings to use on different platforms is given on the :doc:`KOKKOS package <Speed_kokkos>`
doc page.
settings to use on different platforms are given on the :doc:`KOKKOS
package <Speed_kokkos>` doc page.
* d or device
* g or gpus
* t or threads
* n or numa
.. parsed-literal::
@ -164,19 +163,10 @@ the number of physical cores per node, to use your available hardware
optimally. This also sets the number of threads used by the host when
LAMMPS is compiled with CUDA=yes.
.. parsed-literal::
.. deprecated:: 22Dec2022
numa Nm
This option is only relevant when using pthreads with hwloc support.
In this case Nm defines the number of NUMA regions (typically sockets)
on a node which will be utilized by a single MPI rank. By default Nm
= 1. If this option is used the total number of worker-threads per
MPI rank is threads\*numa. Currently it is almost always better to
assign at least one MPI rank per NUMA region, and leave numa set to
its default value of 1. This is because letting a single process span
multiple NUMA regions induces a significant amount of cross NUMA data
traffic which is slow.
Support for the "numa" or "n" option was removed because its
functionality had already been ignored by Kokkos for some time.
----------

View File

@ -1,5 +1,27 @@
# Change Log
## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)
### Bug Fixes:
- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547)
- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552)
- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521)
- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553)
- Fix warning calling a `__host__` function from a `__host__ __device__` from `View:: as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591)
- OpenMPTarget: adding implementation to set device id. [\#5557](https://github.com/kokkos/kokkos/pull/5557)
- Use `Kokkos::atomic_load` to Correct Race Condition Giving Rise to Seg Faulting Error in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559)
- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561)
- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565)
- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566)
- Do not add -cuda to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569)
- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571)
- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582)
- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573)
- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592)
- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607)
- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622)
## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
@ -102,7 +124,6 @@
- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059)
- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)

View File

@ -129,7 +129,7 @@ ENDIF()
set(Kokkos_VERSION_MAJOR 3)
set(Kokkos_VERSION_MINOR 7)
set(Kokkos_VERSION_PATCH 00)
set(Kokkos_VERSION_PATCH 01)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
@ -152,6 +152,7 @@ ENDIF()
# but scoping issues can make it difficult
GLOBAL_SET(KOKKOS_COMPILE_OPTIONS)
GLOBAL_SET(KOKKOS_LINK_OPTIONS)
GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS)
GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS)
@ -228,6 +229,9 @@ IF (KOKKOS_HAS_TRILINOS)
# we have to match the annoying behavior, also we have to preserve quotes
# which needs another workaround.
SET(KOKKOS_COMPILE_OPTIONS_TMP)
IF (KOKKOS_ENABLE_HIP)
LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS})
ENDIF()
FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
IF(OPTION_HAS_WHITESPACE EQUAL -1)

View File

@ -12,7 +12,7 @@ endif
KOKKOS_VERSION_MAJOR = 3
KOKKOS_VERSION_MINOR = 7
KOKKOS_VERSION_PATCH = 00
KOKKOS_VERSION_PATCH = 01
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@ -505,10 +507,6 @@ KOKKOS_LINK_FLAGS =
KOKKOS_SRC =
KOKKOS_HEADERS =
#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
KOKKOS_LIBS += -latomic
#endif
# Generating the KokkosCore_config.h file.
KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@ -550,6 +548,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
KOKKOS_LIBS += -latomic
tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
@ -1197,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -265,8 +265,8 @@ class BinSort {
//----------------------------------------
// Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed
template <class ExecutionSpace = exec_space>
void create_permute_vector(const ExecutionSpace& exec = exec_space{}) {
template <class ExecutionSpace>
void create_permute_vector(const ExecutionSpace& exec) {
static_assert(
Kokkos::SpaceAccessibility<ExecutionSpace,
typename Space::memory_space>::accessible,
@ -297,6 +297,15 @@ class BinSort {
*this);
}
// Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed
void create_permute_vector() {
Kokkos::fence("Kokkos::Binsort::create_permute_vector: before");
exec_space e{};
create_permute_vector(e);
e.fence("Kokkos::Binsort::create_permute_vector: after");
}
// Sort a subset of a view with respect to the first dimension using the
// permutation array
template <class ExecutionSpace, class ValuesViewType>
@ -372,9 +381,10 @@ class BinSort {
template <class ValuesViewType>
void sort(ValuesViewType const& values, int values_range_begin,
int values_range_end) const {
Kokkos::fence("Kokkos::Binsort::sort: before");
exec_space exec;
sort(exec, values, values_range_begin, values_range_end);
exec.fence("Kokkos::Sort: fence after sorting");
exec.fence("Kokkos::BinSort:sort: after");
}
template <class ExecutionSpace, class ValuesViewType>
@ -641,9 +651,10 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
template <class ViewType>
void sort(ViewType const& view) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec;
sort(exec, view);
exec.fence("Kokkos::Sort: fence after sorting");
exec.fence("Kokkos::sort: fence after sorting");
}
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
@ -682,6 +693,7 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
template <class ViewType>
void sort(ViewType view, size_t const begin, size_t const end) {
Kokkos::fence("Kokkos::sort: before");
typename ViewType::execution_space exec;
sort(exec, view, begin, end);
exec.fence("Kokkos::Sort: fence after sorting");
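For reference, a minimal sketch (not part of this diff) of the two sorting call paths touched above: the argument-free convenience overloads now fence before and after, while the overloads taking an execution-space instance leave synchronization to the caller.

#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> values("values", 1000);
    Kokkos::deep_copy(values, 1.0);  // placeholder fill

    // Convenience overload: fences before and after (see the fences added
    // above), so the result is ready when the call returns.
    Kokkos::sort(values);

    // Execution-space overload: runs on `exec`; the caller fences before
    // touching the sorted data from the host.
    Kokkos::DefaultExecutionSpace exec;
    Kokkos::sort(exec, values);
    exec.fence("wait for sort");
  }
  Kokkos::finalize();
  return 0;
}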

View File

@ -10,10 +10,12 @@
# Default settings: change those according to your machine. For
# example, you may have have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten
# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc).
# by using the usual arguments (e.g., -arch=sm_80 -ccbin icpc).
# sm_70 is supported by every CUDA version from 9-12 and is thus
# chosen as default
default_arch="sm_35"
#default_arch="sm_50"
default_arch="sm_70"
#default_arch="sm_80"
#
# The default C++ compiler.

View File

@ -66,6 +66,7 @@
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX
#cmakedefine KOKKOS_ARCH_ARMV81
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2
#cmakedefine KOKKOS_ARCH_A64FX
#cmakedefine KOKKOS_ARCH_AMD_AVX2
#cmakedefine KOKKOS_ARCH_AVX
#cmakedefine KOKKOS_ARCH_AVX2
@ -101,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3

View File

@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;

View File

@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@ -187,7 +188,9 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
SET(CUDA_ARCH_FLAG "-gpu")
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda)
GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled
GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda)
ENDIF()
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
SET(CUDA_ARCH_FLAG "-arch")
ENDIF()
@ -542,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@ -804,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()
IF (KOKKOS_ARCH_HOPPER90)
SET(KOKKOS_ARCH_HOPPER ON)
ENDIF()
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -1701,7 +1701,11 @@ namespace Impl {
underlying memory, to facilitate implementation of deep_copy() and
other routines that are defined on View */
template <unsigned N, typename T, typename... Args>
KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView<T, Args...> v) {
KOKKOS_FUNCTION auto as_view_of_rank_n(
DynRankView<T, Args...> v,
typename std::enable_if<std::is_same<
typename ViewTraits<T, Args...>::specialize, void>::value>::type* =
nullptr) {
if (v.rank() != N) {
KOKKOS_IF_ON_HOST(
const std::string message =
@ -2114,9 +2118,10 @@ inline auto create_mirror(
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename DynRankView<T, P...>::data_type,
typename DynRankView<T, P...>::HostMirror::data_type>::value,
@ -2128,12 +2133,13 @@ create_mirror_view(const DynRankView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename DynRankView<T, P...>::data_type,
typename DynRankView<T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<
typename DynRankView<T, P...>::memory_space,
typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename DynRankView<T, P...>::data_type,
typename DynRankView<T, P...>::HostMirror::data_type>::value),
typename DynRankView<T, P...>::HostMirror>
create_mirror_view(
const DynRankView<T, P...>& src,
@ -2141,29 +2147,39 @@ create_mirror_view(
return Kokkos::Impl::create_mirror(src, arg_prop);
}
template <class Space, class T, class... P, class... ViewCtorArgs>
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t<
Kokkos::is_space<Space>::value &&
Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&, const Kokkos::DynRankView<T, P...>& src,
Kokkos::is_space<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view(const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class Space, class T, class... P, class... ViewCtorArgs>
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline std::enable_if_t<
Kokkos::is_space<Space>::value &&
!Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorDRViewType<Space, T, P...>::view_type>
Kokkos::is_space<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value &&
!Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::is_same_memspace,
typename Impl::MirrorDRViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T,
P...>::view_type>
create_mirror_view(
const Space&, const Kokkos::DynRankView<T, P...>& src,
const Kokkos::DynRankView<T, P...>& src,
const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space;
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);
return Kokkos::Impl::create_mirror(src, prop_copy);
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl
@ -2224,9 +2240,10 @@ create_mirror_view(
template <class Space, class T, class... P>
inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi,
const Space& space,
const Space&,
const Kokkos::DynRankView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
}
template <class T, class... P, class... ViewCtorArgs>
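As a usage note, here is a minimal sketch (assuming Kokkos is already initialized; the names `drv`, `h_mirror`, and `d_mirror` are illustrative) of the dispatch change made above: the Impl overloads now select on whether the view_alloc() property list carries a memory space, rather than on a separate Space argument, matching the calls exercised in TestCreateMirror.cpp further down.

#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>

void mirror_example() {
  using DeviceMemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
  Kokkos::DynRankView<double, Kokkos::DefaultExecutionSpace> drv("drv", 10, 10);

  // No memory space in the property list: falls back to the HostMirror path.
  auto h_mirror = Kokkos::create_mirror_view(
      Kokkos::view_alloc(Kokkos::WithoutInitializing), drv);

  // Memory space passed through view_alloc(): the new overloads above pick
  // the mirror type for that space (and return `drv` itself when the memory
  // spaces match).
  auto d_mirror = Kokkos::create_mirror_view(
      Kokkos::view_alloc(DeviceMemorySpace{}, Kokkos::WithoutInitializing), drv);
  (void)h_mirror;
  (void)d_mirror;
}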

View File

@ -710,7 +710,7 @@ template <class Space, class T, class... P>
inline auto create_mirror(
const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror(
src, Impl::ViewCtorProp<>{typename Space::memory_space{}});
src, Kokkos::view_alloc(typename Space::memory_space{}));
}
template <class Space, class T, class... P>
@ -729,48 +729,68 @@ inline auto create_mirror(
}
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
create_mirror_view(
const typename Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::memory_space,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::DynamicView<T, P...>::data_type,
typename Kokkos::Experimental::DynamicView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::create_mirror(arg_prop, src);
}
template <class Space, class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
Impl::MirrorDynamicViewType<Space, T, P...>::is_same_memspace,
typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&,
const Kokkos::Experimental::DynamicView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorDynamicViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl
// Create a mirror view in host space
@ -790,8 +810,9 @@ inline auto create_mirror_view(
// Create a mirror in a new space
template <class Space, class T, class... P>
inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) {
return Impl::create_mirror_view(src,
view_alloc(typename Space::memory_space{}));
}
template <class Space, class T, class... P>

View File

@ -1901,19 +1901,22 @@ struct MirrorOffsetType {
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror
inline std::enable_if_t<
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space,
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror(
Kokkos::create_mirror(arg_prop, src.view()), src.begins());
}
template <class Space, class T, class... P, class... ViewCtorArgs>
inline typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
create_mirror(const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
inline auto create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>;
using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space;
static_assert(
!alloc_prop_input::has_label,
@ -1923,10 +1926,6 @@ create_mirror(const Space&,
!alloc_prop_input::has_pointer,
"The view constructor arguments passed to Kokkos::create_mirror must "
"not include a pointer!");
static_assert(
!alloc_prop_input::has_memory_space,
"The view constructor arguments passed to Kokkos::create_mirror must "
"not include a memory space instance!");
static_assert(
!alloc_prop_input::allow_padding,
"The view constructor arguments passed to Kokkos::create_mirror must "
@ -1962,15 +1961,17 @@ inline auto create_mirror(
template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{});
const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}));
}
template <class Space, class T, class... P>
typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror(space, src, Kokkos::view_alloc(wi));
return Impl::create_mirror(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
}
template <class T, class... P, class... ViewCtorArgs>
@ -1983,54 +1984,64 @@ inline auto create_mirror(
namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror_view(
const typename Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::memory_space,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
typename Kokkos::Experimental::OffsetView<
T, P...>::HostMirror::data_type>::value),
typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return Kokkos::create_mirror(arg_prop, src);
}
template <class Space, class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
Kokkos::Experimental::OffsetView<T, P...>>
create_mirror_view(const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
template <class Space, class T, class... P, class... ViewCtorArgs>
std::enable_if_t<
!Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace,
typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type>
create_mirror_view(const Space& space,
const Kokkos::Experimental::OffsetView<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorOffsetViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
return create_mirror(space, src, arg_prop);
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl
@ -2052,15 +2063,17 @@ inline auto create_mirror_view(
template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
inline auto create_mirror_view(
const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{});
const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}));
}
template <class Space, class T, class... P>
inline auto create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, const Space& space,
Kokkos::Impl::WithoutInitializing_t wi, const Space&,
const Kokkos::Experimental::OffsetView<T, P...>& src) {
return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi));
return Impl::create_mirror_view(
src, Kokkos::view_alloc(typename Space::memory_space{}, wi));
}
template <class T, class... P, class... ViewCtorArgs>

View File

@ -46,3 +46,13 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL)
KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources})
endif()
endforeach()
SET(COMPILE_ONLY_SOURCES
TestCreateMirror.cpp
)
KOKKOS_ADD_EXECUTABLE(
TestCompileOnly
SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES}
)

View File

@ -0,0 +1 @@
int main() {}

View File

@ -0,0 +1,179 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <Kokkos_DynamicView.hpp>
#include <Kokkos_DynRankView.hpp>
#include <Kokkos_OffsetView.hpp>
template <typename TestView, typename MemorySpace>
void check_memory_space(TestView, MemorySpace) {
static_assert(
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
}
template <class View>
auto host_mirror_test_space(View) {
return std::conditional_t<
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
typename View::memory_space>::accessible,
typename View::memory_space, Kokkos::HostSpace>{};
}
template <typename View>
void test_create_mirror_properties(const View& view) {
using namespace Kokkos;
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
// clang-format off
// create_mirror
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror( view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror_view
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror view_alloc
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc + execution space
#ifndef KOKKOS_ENABLE_CXX14
// FIXME DynamicView: HostMirror is the same type
if constexpr (!is_dynamic_view<View>::value) {
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
}
#endif
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc + execution space
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
// clang-format on
}
void test_create_mirror_dynrankview() {
Kokkos::DynRankView<int, Kokkos::DefaultExecutionSpace> device_view(
"device view", 10);
Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
void test_create_mirror_offsetview() {
Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace>
device_view("device view", {0, 10});
Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view(
"host view", {0, 10});
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
void test_create_mirror_dynamicview() {
Kokkos::Experimental::DynamicView<int*, Kokkos::DefaultExecutionSpace>
device_view("device view", 2, 10);
Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view(
"host view", 2, 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -53,13 +53,69 @@
namespace Kokkos {
namespace Impl {
inline int cuda_warp_per_sm_allocation_granularity(
cudaDeviceProp const& properties) {
// Allocation granularity of warps in each sm
switch (properties.major) {
case 3:
case 5:
case 7:
case 8:
case 9: return 4;
case 6: return (properties.minor == 0 ? 2 : 4);
default:
throw_runtime_exception(
"Unknown device in cuda warp per sm allocation granularity");
return 0;
}
}
inline int cuda_max_warps_per_sm_registers(
cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
// Maximum number of warps per sm as a function of register counts,
// subject to the constraint that warps are allocated with a fixed granularity
int const max_regs_per_block = properties.regsPerBlock;
int const regs_per_warp = attributes.numRegs * properties.warpSize;
int const warp_granularity =
cuda_warp_per_sm_allocation_granularity(properties);
// The granularity of register allocation is chunks of 256 registers per warp,
// which implies a need to over-allocate, so we round up
int const allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);
// The maximum number of warps per SM is constrained from above by register
// allocation. To satisfy the constraint that warps per SM is allocated at a
// finite granularity, we need to round down.
int const max_warps_per_sm =
warp_granularity *
(max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
return max_warps_per_sm;
}
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
// Limits due do registers/SM
// Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
// Compute the maximum number of warps as a function of the number of
// registers
int const max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(properties, attributes);
// Constrain the number of blocks to respect the maximum number of warps per
// SM. On face value this should be an equality, but due to the warp
// granularity constraints noted in `cuda_max_warps_per_sm_registers` the
// left-hand side of this comparison can overshoot what the hardware allows
// based on register counts alone.
while ((max_blocks_regs * block_size / properties.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;
// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@ -203,40 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these numbers can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
// bytes is then to be found below in the summary.
// * Another option would be to look for the information in the "Tuning
// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
// the "Shared Memory" section (more tedious)
inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
int const compute_capability = properties.major * 10 + properties.minor;
return [compute_capability]() {
switch (compute_capability) {
case 30:
case 32:
case 35: return 16;
case 37: return 80;
case 50:
case 53:
case 60:
case 62: return 64;
case 52:
case 61: return 96;
case 70:
case 80:
case 86: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
"Unknown device in cuda block size deduction");
}
return 0;
}() * 1024;
template <class LaunchBounds>
int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
LaunchBounds) {
auto const& prop = Kokkos::Cuda().cuda_device_prop();
// Thin version of cuda_get_opt_block_size for cases where there is no shared
// memory
auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
LaunchBounds{});
}
} // namespace Impl
} // namespace Kokkos
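To make the register arithmetic above concrete, here is a standalone sketch that reproduces the formula of cuda_max_warps_per_sm_registers() with illustrative numbers; the hardware constants (a 64K-register block limit, warp size 32, a granularity of 4 warps) and the 37 registers per thread are assumptions for the example, not values queried from a real device.

#include <cstdio>

int main() {
  const int regs_per_block_limit = 65536;  // assumed properties.regsPerBlock
  const int warp_size            = 32;     // assumed properties.warpSize
  const int warp_granularity     = 4;      // e.g. the value returned above for major == 7, 8, or 9
  const int regs_per_thread      = 37;     // assumed attributes.numRegs for some kernel

  // Registers are allocated in chunks of 256 per warp, so round up.
  const int regs_per_warp           = regs_per_thread * warp_size;              // 1184
  const int allocated_regs_per_warp = 256 * ((regs_per_warp + 256 - 1) / 256);  // 1280

  // Warps are handed out with a fixed granularity, so round the count down.
  const int max_warps_per_sm =
      warp_granularity *
      (regs_per_block_limit / (allocated_regs_per_warp * warp_granularity));    // 4 * 12 = 48

  std::printf("register-limited warps per SM: %d\n", max_warps_per_sm);
  return 0;
}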

View File

@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \
((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION

View File

@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif
#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
#else
cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
#endif
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();

View File

@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@ -158,63 +154,119 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}
// This function needs to be template on DriverType and LaunchBounds
// These functions need to be templated on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(KernelFuncPtr const& func,
bool prefer_shmem) {
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
auto set_cache_config = [&] {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
func,
(prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
return prefer_shmem;
const cudaFuncAttributes& get_cuda_kernel_func_attributes(
const KernelFuncPtr& func) {
// Only call cudaFuncGetAttributes once for each unique kernel
// by leveraging static variable initialization rules
auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
cudaFuncAttributes attr;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
return attr;
};
static bool cache_config_preference_cached = set_cache_config();
if (cache_config_preference_cached != prefer_shmem) {
static cudaFuncAttributes func_attr = wrap_get_attributes();
return func_attr;
}
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
inline void configure_shmem_preference(const KernelFuncPtr& func,
const cudaDeviceProp& device_props,
const size_t block_size, int& shmem,
const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
const auto& func_attr =
get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
// Compute limits for number of blocks due to registers/SM
const size_t regs_per_sm = device_props.regsPerMultiprocessor;
const size_t regs_per_thread = func_attr.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
// Compute the maximum number of warps as a function of the number of
// registers
const size_t max_warps_per_sm_registers =
cuda_max_warps_per_sm_registers(device_props, func_attr);
// Constrain the number of blocks to respect the maximum number of warps per
// SM. On face value this should be an equality, but due to the warp
// granularity constraints noted in `cuda_max_warps_per_sm_registers` the
// left-hand side of this comparison can overshoot what the hardware allows
// based on register counts alone.
while ((max_blocks_regs * block_size / device_props.warpSize) >
max_warps_per_sm_registers)
max_blocks_regs--;
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
const size_t num_threads_desired =
((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
// Get close to the desired occupancy:
// don't undershoot by much, but also don't allocate a whole extra block
// just because the request is a few threads over.
size_t num_blocks_desired =
(num_threads_desired + block_size * 0.8) / block_size;
num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
if (num_blocks_desired == 0) num_blocks_desired = 1;
// Calculate how much shared memory we need per block
size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
// The minimum shared memory allocation we can have in total per SM is 8kB.
// If we want to lower occupancy we have to make sure we request at least that
// much in aggregate over all blocks, so that shared memory actually becomes a
// limiting factor for occupancy
constexpr size_t min_shmem_size_per_sm = 8192;
if ((occupancy < 100) &&
(shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
// Need to set the caller's shmem variable so that the
// kernel launch uses the correct dynamic shared memory request
shmem = shmem_per_block - func_attr.sharedSizeBytes;
}
// Compute the carveout fraction we need based on occupancy
// Use multiples of 8kB
const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
size_t carveout = shmem_per_block == 0
? 0
: 100 *
(((num_blocks_desired * shmem_per_block +
min_shmem_size_per_sm - 1) /
min_shmem_size_per_sm) *
min_shmem_size_per_sm) /
max_shmem_per_sm;
if (carveout > 100) carveout = 100;
// Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
return carveout;
};
// Store the value in a static variable so we only reset if needed
static size_t cache_config_preference_cached = set_cache_config();
if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
(void)prefer_shmem;
(void)device_props;
(void)block_size;
(void)occupancy;
#endif
}
template <class Policy>
std::enable_if_t<Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const& policy, cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
bool& prefer_shmem) {
int const block_size = block.x * block.y * block.z;
int const desired_occupancy = policy.impl_get_desired_occupancy().value();
size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
size_t const static_shmem = attributes.sharedSizeBytes;
// round to nearest integer and avoid division by zero
int active_blocks = std::max(
1, static_cast<int>(std::round(
static_cast<double>(properties.maxThreadsPerMultiProcessor) /
block_size * desired_occupancy / 100)));
int const dynamic_shmem =
shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
if (dynamic_shmem > shmem) {
shmem = dynamic_shmem;
prefer_shmem = false;
}
}
template <class Policy>
std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
modify_launch_configuration_if_desired_occupancy_is_specified(
Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
@ -348,7 +400,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) {
CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@ -358,8 +410,19 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
void const* args[] = {&driver};
@ -442,7 +505,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) {
CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@ -452,8 +515,18 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), prefer_shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@ -566,7 +639,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
CudaInternal const* cuda_instance, bool prefer_shmem) {
CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@ -580,7 +653,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
driver, grid, block, shmem, cuda_instance, prefer_shmem);
driver, grid, block, shmem, cuda_instance);
}
#endif
};
@ -613,8 +686,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
const CudaInternal* cuda_instance,
bool prefer_shmem) {
const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@ -623,20 +695,22 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem);
// If a desired occupancy is specified, we compute how much shared memory
// to ask for to achieve that occupancy, assuming that the cache
// configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
// shared memory computed is actually smaller than `shmem` we overwrite
// `shmem` and set `prefer_shmem` to `false`.
modify_launch_configuration_if_desired_occupancy_is_specified(
driver.get_policy(), cuda_instance->m_deviceProp,
get_cuda_func_attributes(), block, shmem, prefer_shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
/*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
shmem, desired_occupancy);*/
Kokkos::Impl::throw_runtime_exception(
std::string("Cuda graph node creation FAILED:"
" occupancy requests are currently broken."));
}
Impl::configure_shmem_preference<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), prefer_shmem);
ensure_cuda_lock_arrays_on_device();
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
@ -650,18 +724,9 @@ struct CudaParallelLaunchImpl<
}
static cudaFuncAttributes get_cuda_func_attributes() {
// Race condition inside of cudaFuncGetAttributes if the same address is
// given requires using a local variable as input instead of a static Rely
// on static variable initialization to make sure only one thread executes
// the code and the result is visible.
auto wrap_get_attributes = []() -> cudaFuncAttributes {
cudaFuncAttributes attr_tmp;
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
return attr_tmp;
};
static cudaFuncAttributes attr = wrap_get_attributes();
return attr;
return get_cuda_kernel_func_attributes<
DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func());
}
};

View File

@ -79,7 +79,8 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0};
void initialize_host_cuda_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
desul::Impl::init_lock_arrays();
desul::ensure_cuda_lock_arrays_on_device();
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
#endif
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
KOKKOS_IMPL_CUDA_SAFE_CALL(
@ -88,7 +89,7 @@ void initialize_host_cuda_lock_arrays() {
Impl::cuda_device_synchronize(
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
g_host_cuda_lock_arrays.n = Cuda::concurrency();
copy_cuda_lock_arrays_to_device();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
256>>>();
Impl::cuda_device_synchronize(
@ -105,7 +106,7 @@ void finalize_host_cuda_lock_arrays() {
g_host_cuda_lock_arrays.atomic = nullptr;
g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
copy_cuda_lock_arrays_to_device();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif
}

View File

@ -67,7 +67,7 @@ struct CudaLockArrays {
/// \brief This global variable in Host space is the central definition
/// of these arrays.
extern CudaLockArrays g_host_cuda_lock_arrays;
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;
/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays.
@ -105,12 +105,12 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
__constant__ extern
#endif
CudaLockArrays g_device_cuda_lock_arrays;
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
@ -123,7 +123,9 @@ __device__ inline bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1));
return (
0 ==
atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
}
/// \brief Release lock for the address
@ -136,7 +138,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0);
atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
}
} // namespace Impl
@ -149,49 +151,45 @@ namespace {
static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace
} // namespace Impl
} // namespace Kokkos
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline
#else
static
#endif
void
copy_cuda_lock_arrays_to_device() {
if (lock_array_copied == 0) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays,
&g_host_cuda_lock_arrays,
sizeof(CudaLockArrays)));
/* Dan Ibanez: it is critical that this code be a macro, so that it will
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
putting this in an inline function will NOT do the right thing! */
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
if (::Kokkos::Impl::lock_array_copied == 0) { \
KOKKOS_IMPL_CUDA_SAFE_CALL( \
cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
&Kokkos::Impl::g_host_cuda_lock_arrays, \
sizeof(Kokkos::Impl::CudaLockArrays))); \
} \
lock_array_copied = 1; \
}
lock_array_copied = 1;
}
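The comment explains why this has to be a macro: without relocatable device code, every translation unit that includes the header gets its own instance of the __device__ symbol, so the cudaMemcpyToSymbol call must be stamped out in the translation unit that will launch the kernels. A hedged, self-contained illustration of the pattern (not the Kokkos sources; the names here are invented):

#include <cuda_runtime.h>

__device__ int g_device_table[4];  // one instance per TU when RDC is off

// Expanding this macro in a translation unit makes cudaMemcpyToSymbol bind to
// *that* unit's instance of g_device_table; an inline function defined in
// another unit would bind to the wrong instance.
#define COPY_TABLE_TO_DEVICE(host_ptr)                               \
  do {                                                               \
    cudaMemcpyToSymbol(g_device_table, (host_ptr), sizeof(int) * 4); \
  } while (0)

int main() {
  int host_table[4] = {1, 2, 3, 4};
  COPY_TABLE_TO_DEVICE(host_table);  // expands here, in this TU
  return 0;
}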
#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
inline static void ensure_cuda_lock_arrays_on_device() {
copy_cuda_lock_arrays_to_device();
}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
#endif
#else
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
inline void ensure_cuda_lock_arrays_on_device() {}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
inline static void ensure_cuda_lock_arrays_on_device() {
copy_cuda_lock_arrays_to_device();
desul::ensure_cuda_lock_arrays_on_device();
}
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#endif
#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
} // namespace Impl
} // namespace Kokkos
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */

View File

@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {
template <typename ParallelType, typename Policy, typename LaunchBounds>
int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelType,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const optimal_block_size =
Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
// Compute how many blocks of this size we can launch, based on warp
// constraints
int const max_warps_per_sm_registers =
Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
int const max_num_threads_from_warps =
max_warps_per_sm_registers * prop.warpSize;
int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
// Compute the total number of threads
int const max_threads_per_sm = optimal_block_size * max_num_blocks;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
}
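The new helper folds the register- and warp-based limits into one place. A standalone sketch of the same arithmetic with illustrative numbers (the real code pulls these values from cudaFuncAttributes and cudaDeviceProp; the function name below is made up):

#include <algorithm>
#include <cstdio>

int max_tile_size_product_sketch(int optimal_block_size,  // from register use
                                 int max_warps_per_sm,    // register-limited
                                 int warp_size,
                                 int max_hierarchical_parallelism) {
  int const max_threads_from_warps = max_warps_per_sm * warp_size;
  int const max_blocks_per_sm = max_threads_from_warps / optimal_block_size;
  int const max_threads_per_sm = optimal_block_size * max_blocks_per_sm;
  return std::min(max_threads_per_sm, max_hierarchical_parallelism);
}

int main() {
  // e.g. 256-thread blocks, 48 register-limited warps of 32 threads,
  // and a cap of 1048576 on hierarchical parallelism
  std::printf("%d\n", max_tile_size_product_sketch(256, 48, 32, 1048576));
}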
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelFor,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM, MDRange doesn't have
// shared memory constraints
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@ -121,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@ -139,8 +155,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@ -158,8 +173,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@ -175,8 +189,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@ -191,8 +204,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@ -263,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelReduce,
LaunchBounds>::get_cuda_func_attributes();
auto const& prop = pol.space().cuda_device_prop();
// Limits due to registers/SM
int const regs_per_sm = prop.regsPerMultiprocessor;
int const regs_per_thread = attr.numRegs;
int const max_threads_per_sm = regs_per_sm / regs_per_thread;
return std::min(
max_threads_per_sm,
static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {
@ -405,8 +407,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {

View File

@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
false);
*this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@ -465,8 +464,24 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
public:
using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type;
using value_type = typename Analysis::value_type;
using functor_type = FunctorType;
using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
private:
// Algorithmic constraints:
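The word_size_type alias introduced above picks a narrower word when the scan's value_type is smaller than the default 4-byte size_type. A quick standalone check of how that std::conditional_t selection behaves (a sketch with a stand-in 4-byte size_type; the alias name is invented):

#include <cstdint>
#include <type_traits>

template <class ValueType, class SizeType = std::uint32_t>
using word_size_type_sketch = std::conditional_t<
    sizeof(ValueType) < sizeof(SizeType),
    std::conditional_t<sizeof(ValueType) == 2, std::int16_t, std::int8_t>,
    SizeType>;

// 1-byte scalars scan in 1-byte words, 2-byte scalars in 2-byte words,
// and anything >= 4 bytes keeps using size_type words as before.
static_assert(std::is_same<word_size_type_sketch<std::int8_t>, std::int8_t>::value, "");
static_assert(std::is_same<word_size_type_sketch<std::int16_t>, std::int16_t>::value, "");
static_assert(std::is_same<word_size_type_sketch<double>, std::uint32_t>::value, "");

int main() {}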
@ -477,7 +492,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
const FunctorType m_functor;
const Policy m_policy;
size_type* m_scratch_space;
word_size_type* m_scratch_space;
size_type* m_scratch_flags;
size_type m_final;
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@ -501,12 +516,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() +
word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y;
final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@ -532,7 +547,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// gridDim.x
cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags);
}
@ -541,21 +556,22 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
__device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
size_type* const shared_prefix =
word_size_type* const shared_data =
kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum =
word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total.
if (blockIdx.x) {
size_type* const block_total =
word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -602,7 +618,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
typename Analysis::pointer_type(shared_data + word_count.value));
{
size_type* const block_total =
word_size_type* const block_total =
shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -690,8 +706,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
// How many blocks are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x);
m_scratch_space =
reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
@ -708,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
}
}
@ -752,10 +769,26 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
Policy, FunctorType>;
public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType;
using size_type = Cuda::size_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
private:
// Algorithmic constraints:
@ -766,7 +799,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
const FunctorType m_functor;
const Policy m_policy;
size_type* m_scratch_space;
word_size_type* m_scratch_space;
size_type* m_scratch_flags;
size_type m_final;
ReturnType& m_returnvalue;
@ -791,12 +824,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
size_type* const shared_value =
kokkos_impl_cuda_shared_memory<size_type>() +
word_size_type* const shared_value =
kokkos_impl_cuda_shared_memory<word_size_type>() +
word_count.value * threadIdx.y;
final_reducer.init(reinterpret_cast<pointer_type>(shared_value));
@ -822,7 +855,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// gridDim.x
cuda_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x,
kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space,
kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space,
m_scratch_flags);
}
@ -831,21 +864,22 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
__device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... }
size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>();
size_type* const shared_prefix =
word_size_type* const shared_data =
kokkos_impl_cuda_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum =
word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total.
if (blockIdx.x) {
size_type* const block_total =
word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -894,7 +928,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
typename Analysis::pointer_type(shared_data + word_count.value));
{
size_type* const block_total =
word_size_type* const block_total =
shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -983,8 +1017,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
// How many blocks are really needed for this much work:
const int grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x);
m_scratch_space =
reinterpret_cast<word_size_type*>(cuda_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * grid_x));
m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1);
@ -1002,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
m_policy.space().impl_internal_space_instance(),
false); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@ -1022,7 +1057,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
#endif
DeepCopy<HostSpace, CudaSpace, Cuda>(
m_policy.space(), &m_returnvalue,
m_scratch_space + (grid_x - 1) * size / sizeof(int), size);
m_scratch_space + (grid_x - 1) * size / sizeof(word_size_type),
size);
}
}
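Together with the scratch-space casts above, these changes let ParallelScan run on value types narrower than 4 bytes. A hedged usage sketch (assumes an installed Kokkos; not taken from the test suite verbatim):

#include <Kokkos_Core.hpp>
#include <cstdint>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    std::int16_t total = 0;
    Kokkos::parallel_scan(
        "small_scan", Kokkos::RangePolicy<>(0, 100),
        KOKKOS_LAMBDA(int i, std::int16_t& update, bool final_pass) {
          (void)i;
          (void)final_pass;
          update += std::int16_t(1);  // each iteration contributes 1
        },
        total);
    std::printf("total = %d\n", int(total));  // expect 100
  }
  Kokkos::finalize();
}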

View File

@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(),
true); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
m_policy.space().impl_internal_space_instance(),
true); // copy to device and execute
m_policy.space()
.impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
m_policy.space().fence(

View File

@ -116,6 +116,7 @@ __device__ inline void cuda_inter_warp_reduction(
value = result[0];
for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++)
reducer.join(&value, &result[i]);
__syncthreads();
}
template <class ValueType, class ReducerType>
@ -427,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024

View File

@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>(
*this, grid, block, shared, Cuda().impl_internal_space_instance(),
false);
*this, grid, block, shared, Cuda().impl_internal_space_instance());
}
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)

View File

@ -448,11 +448,27 @@ class ParallelScanHIPBase {
Policy, FunctorType>;
public:
using value_type = typename Analysis::value_type;
using pointer_type = typename Analysis::pointer_type;
using reference_type = typename Analysis::reference_type;
using functor_type = FunctorType;
using size_type = Kokkos::Experimental::HIP::size_type;
using index_type = typename Policy::index_type;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::HIP::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the scan is performed.
// Within the scan, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the scan, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = std::conditional_t<
sizeof(value_type) < sizeof(size_type),
std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>;
protected:
// Algorithmic constraints:
@ -463,10 +479,10 @@ class ParallelScanHIPBase {
const FunctorType m_functor;
const Policy m_policy;
size_type* m_scratch_space = nullptr;
size_type* m_scratch_flags = nullptr;
size_type m_final = false;
int m_grid_x = 0;
word_size_type* m_scratch_space = nullptr;
size_type* m_scratch_flags = nullptr;
size_type m_final = false;
int m_grid_x = 0;
// Only let one ParallelReduce/Scan modify the shared memory. The
// constructor acquires the mutex which is released in the destructor.
std::lock_guard<std::mutex> m_shared_memory_lock;
@ -489,12 +505,12 @@ class ParallelScanHIPBase {
__device__ inline void initial() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
pointer_type const shared_value = reinterpret_cast<pointer_type>(
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>() +
word_count.value * threadIdx.y);
final_reducer.init(shared_value);
@ -518,7 +534,7 @@ class ParallelScanHIPBase {
// gridDim.x
hip_single_inter_block_reduce_scan<true>(
final_reducer, blockIdx.x, gridDim.x,
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(),
Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>(),
m_scratch_space, m_scratch_flags);
}
@ -527,22 +543,22 @@ class ParallelScanHIPBase {
__device__ inline void final() const {
typename Analysis::Reducer final_reducer(&m_functor);
const integral_nonzero_constant<size_type, Analysis::StaticValueSize /
sizeof(size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(size_type));
const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize /
sizeof(word_size_type)>
word_count(Analysis::value_size(m_functor) / sizeof(word_size_type));
// Use shared memory as an exclusive scan: { 0 , value[0] , value[1] ,
// value[2] , ... }
size_type* const shared_data =
Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>();
size_type* const shared_prefix =
word_size_type* const shared_data =
Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>();
word_size_type* const shared_prefix =
shared_data + word_count.value * threadIdx.y;
size_type* const shared_accum =
word_size_type* const shared_accum =
shared_data + word_count.value * (blockDim.y + 1);
// Starting value for this thread block is the previous block's total.
if (blockIdx.x) {
size_type* const block_total =
word_size_type* const block_total =
m_scratch_space + word_count.value * (blockIdx.x - 1);
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -588,7 +604,7 @@ class ParallelScanHIPBase {
typename Analysis::pointer_type(shared_data + word_count.value));
{
size_type* const block_total =
word_size_type* const block_total =
shared_data + word_count.value * blockDim.y;
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_accum[i] = block_total[i];
@ -647,8 +663,9 @@ class ParallelScanHIPBase {
// How many blocks are really needed for this much work:
m_grid_x = (nwork + work_per_block - 1) / work_per_block;
m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x);
m_scratch_space = reinterpret_cast<word_size_type*>(
Kokkos::Experimental::Impl::hip_internal_scratch_space(
m_policy.space(), Analysis::value_size(m_functor) * m_grid_x));
m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
m_policy.space(), sizeof(size_type) * 1);
@ -734,7 +751,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
Kokkos::Experimental::HIP>(
Base::m_policy.space(), &m_returnvalue,
Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int),
Base::m_scratch_space + (Base::m_grid_x - 1) * size /
sizeof(typename Base::word_size_type),
size);
}
}

View File

@ -225,11 +225,11 @@ struct HIPReductionsFunctor<FunctorType, false> {
}
}
template <typename SizeType>
__device__ static inline bool scalar_inter_block_reduction(
FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) {
Scalar* const global_team_buffer_element =
reinterpret_cast<Scalar*>(global_data);
@ -411,16 +411,14 @@ __device__ void hip_intra_block_reduce_scan(
* Global reduce result is in the last threads' 'shared_data' location.
*/
template <bool DoScan, class FunctorType>
template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan_impl(
FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) {
using size_type = ::Kokkos::Experimental::HIP::size_type;
using size_type = SizeType;
using value_type = typename FunctorType::value_type;
using pointer_type = typename FunctorType::pointer_type;
@ -518,13 +516,12 @@ __device__ bool hip_single_inter_block_reduce_scan_impl(
return is_last_block;
}
template <bool DoScan, typename FunctorType>
template <bool DoScan, typename FunctorType, typename SizeType>
__device__ bool hip_single_inter_block_reduce_scan(
FunctorType const& functor,
::Kokkos::Experimental::HIP::size_type const block_id,
::Kokkos::Experimental::HIP::size_type const block_count,
::Kokkos::Experimental::HIP::size_type* const shared_data,
::Kokkos::Experimental::HIP::size_type* const global_data,
SizeType* const shared_data, SizeType* const global_data,
::Kokkos::Experimental::HIP::size_type* const global_flags) {
// If we are doing a reduction and we don't do an array reduction, we use the
// reduction-only path. Otherwise, we use the common path between reduction

View File

@ -116,6 +116,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction(
value = result[0];
for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i)
reducer.join(&value, &result[i]);
__syncthreads();
}
template <typename ValueType, typename ReducerType>

View File

@ -3711,12 +3711,13 @@ namespace Impl {
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
(std::is_same<
typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
(std::is_same<
typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
typename Kokkos::View<T, P...>::HostMirror>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
@ -3725,12 +3726,13 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs>
inline std::enable_if_t<
!(std::is_same<
typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
!Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space &&
!(std::is_same<typename Kokkos::View<T, P...>::memory_space,
typename Kokkos::View<
T, P...>::HostMirror::memory_space>::value &&
std::is_same<
typename Kokkos::View<T, P...>::data_type,
typename Kokkos::View<T, P...>::HostMirror::data_type>::value),
typename Kokkos::View<T, P...>::HostMirror>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
@ -3738,25 +3740,33 @@ create_mirror_view(const Kokkos::View<T, P...>& src,
}
// Create a mirror view in a new space (specialization for same space)
template <class Space, class T, class... P, class... ViewCtorArgs>
std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>&) {
return src;
}
// Create a mirror view in a new space (specialization for different space)
template <class Space, class T, class... P, class... ViewCtorArgs>
std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace,
typename Impl::MirrorViewType<Space, T, P...>::view_type>
create_mirror_view(const Space&, const Kokkos::View<T, P...>& src,
template <class T, class... P, class... ViewCtorArgs,
class = std::enable_if_t<
Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>>
std::enable_if_t<!Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::is_same_memspace,
typename Impl::MirrorViewType<
typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space,
T, P...>::view_type>
create_mirror_view(const Kokkos::View<T, P...>& src,
const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) {
using MemorySpace = typename Space::memory_space;
using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>;
alloc_prop prop_copy(arg_prop);
return Kokkos::Impl::create_mirror(src, prop_copy);
return Kokkos::Impl::create_mirror(src, arg_prop);
}
} // namespace Impl
@ -3815,9 +3825,10 @@ typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
template <class Space, class T, class... P,
typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>>
typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
Kokkos::Impl::WithoutInitializing_t wi, Space const& space,
Kokkos::Impl::WithoutInitializing_t wi, Space const&,
Kokkos::View<T, P...> const& v) {
return Impl::create_mirror_view(space, v, view_alloc(wi));
return Impl::create_mirror_view(
v, view_alloc(typename Space::memory_space{}, wi));
}
template <class T, class... P, class... ViewCtorArgs>
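With the reworked overloads, the destination memory space travels inside the view_alloc property list instead of being passed as a separate Space argument. A hedged usage sketch (mirrors the calls exercised by the new TestCreateMirror.cpp further down):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> v("v", 10);
    // Memory space and WithoutInitializing are both view_alloc properties now.
    auto h = Kokkos::create_mirror_view(
        Kokkos::view_alloc(Kokkos::WithoutInitializing, Kokkos::HostSpace{}), v);
    Kokkos::deep_copy(h, v);
  }
  Kokkos::finalize();
}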

View File

@ -1754,7 +1754,10 @@ struct RankDataType<ValueType, 0> {
};
template <unsigned N, typename... Args>
KOKKOS_FUNCTION std::enable_if_t<N == View<Args...>::Rank, View<Args...>>
KOKKOS_FUNCTION std::enable_if_t<
N == View<Args...>::Rank &&
std::is_same<typename ViewTraits<Args...>::specialize, void>::value,
View<Args...>>
as_view_of_rank_n(View<Args...> v) {
return v;
}
@ -1762,13 +1765,13 @@ as_view_of_rank_n(View<Args...> v) {
// Placeholder implementation to compile generic code for DynRankView; should
// never be called
template <unsigned N, typename T, typename... Args>
std::enable_if_t<
N != View<T, Args...>::Rank,
KOKKOS_FUNCTION std::enable_if_t<
N != View<T, Args...>::Rank &&
std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value,
View<typename RankDataType<typename View<T, Args...>::value_type, N>::type,
Args...>>
as_view_of_rank_n(View<T, Args...>) {
Kokkos::Impl::throw_runtime_exception(
"Trying to get at a View of the wrong rank");
Kokkos::abort("Trying to get at a View of the wrong rank");
return {};
}

View File

@ -101,8 +101,8 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
void push_work(const std::int32_t w) const noexcept {
const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const ready_queue = &m_queue[0];
std::int32_t volatile* const end_hint = &m_queue[2 * N + 1];
std::int32_t* const ready_queue = &m_queue[0];
std::int32_t* const end_hint = &m_queue[2 * N + 1];
// Push work to end of queue
const std::int32_t j = atomic_fetch_add(end_hint, 1);
@ -134,14 +134,14 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
std::int32_t pop_work() const noexcept {
const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const ready_queue = &m_queue[0];
std::int32_t volatile* const begin_hint = &m_queue[2 * N];
std::int32_t* const ready_queue = &m_queue[0];
std::int32_t* const begin_hint = &m_queue[2 * N];
// begin hint is guaranteed to be less than or equal to
// actual begin location in the queue.
for (std::int32_t i = *begin_hint; i < N; ++i) {
const std::int32_t w = ready_queue[i];
for (std::int32_t i = Kokkos::atomic_load(begin_hint); i < N; ++i) {
const std::int32_t w = Kokkos::atomic_load(&ready_queue[i]);
if (w == END_TOKEN) {
return END_TOKEN;
@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
const std::int32_t N = m_graph.numRows();
std::int32_t volatile* const count_queue = &m_queue[N];
std::int32_t* const count_queue = &m_queue[N];
const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w + 1);
@ -199,7 +199,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
KOKKOS_INLINE_FUNCTION
void operator()(const TagCount, int i) const noexcept {
std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()];
std::int32_t* const count_queue = &m_queue[m_graph.numRows()];
atomic_increment(count_queue + m_graph.entries[i]);
}
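The queue accesses switch from volatile pointers to explicit atomic loads. A hedged sketch of that pattern in isolation (illustrative only; the real queue lives inside WorkGraphPolicy, and atomic_store is used here just to give the load something to observe):

#include <Kokkos_Core.hpp>
#include <cstdint>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<std::int32_t*> queue("queue", 4);
    Kokkos::parallel_for(
        "poll_queue", 1, KOKKOS_LAMBDA(int) {
          // was: std::int32_t volatile* q = &queue(0); ... read q[i] ...
          Kokkos::atomic_store(&queue(0), 42);
          std::int32_t const w = Kokkos::atomic_load(&queue(0));
          (void)w;
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
}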

View File

@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_AMPERE)
defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else

View File

@ -47,6 +47,7 @@
#endif
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_DeviceManagement.hpp>
#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP)
@ -114,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
@ -164,7 +166,11 @@ void OpenMPTarget::impl_static_fence(const std::string& name) {
name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
}
void OpenMPTarget::impl_initialize(InitializationSettings const&) {
void OpenMPTarget::impl_initialize(InitializationSettings const& settings) {
using Kokkos::Impl::get_gpu;
const int device_num = get_gpu(settings);
omp_set_default_device(device_num);
Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize();
}
void OpenMPTarget::impl_finalize() {

View File

@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
!defined(KOKKOS_ARCH_AMPERE)
!defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;

View File

@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting too many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting too many registers on NVIDIA GPUs.
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

View File

@ -110,10 +110,9 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept {
return ((uint64_t)a) | (((uint64_t)d) << 32);
#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \
defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__)
#elif defined(__powerpc64__) || defined(__ppc64__)
unsigned int cycles = 0;
unsigned long cycles = 0;
asm volatile("mftb %0" : "=r"(cycles));

View File

@ -166,6 +166,8 @@ int get_device_count() {
#elif defined(KOKKOS_ENABLE_OPENACC)
return acc_get_num_devices(
Kokkos::Experimental::Impl::OpenACC_Traits::dev_type);
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
return omp_get_num_devices();
#else
Kokkos::abort("implementation bug");
return -1;
@ -426,11 +428,17 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) {
Kokkos::abort("implementation bug");
}
auto const* local_rank_str =
std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); // OpenMPI
if (!local_rank_str)
local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); // MVAPICH2
if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); // SLURM
char const* local_rank_str = nullptr;
for (char const* env_var : {
"OMPI_COMM_WORLD_LOCAL_RANK", // OpenMPI
"MV2_COMM_WORLD_LOCAL_RANK", // MVAPICH2
"MPI_LOCALRANKID", // MPICH
"SLURM_LOCALID", // SLURM
"PMI_LOCAL_RANK" // PMI
}) {
local_rank_str = std::getenv(env_var);
if (local_rank_str) break;
}
// use first GPU available for execution if unable to detect local MPI rank
if (!local_rank_str) {
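The rank-detection logic now walks a list of launcher-specific environment variables and keeps the first one that is set. A standalone sketch of that loop (the variable names mirror the diff; everything else is illustrative):

#include <cstdio>
#include <cstdlib>
#include <initializer_list>

int main() {
  char const* local_rank_str = nullptr;
  for (char const* env_var : {
           "OMPI_COMM_WORLD_LOCAL_RANK",  // OpenMPI
           "MV2_COMM_WORLD_LOCAL_RANK",   // MVAPICH2
           "MPI_LOCALRANKID",             // MPICH
           "SLURM_LOCALID",               // SLURM
           "PMI_LOCAL_RANK"               // PMI
       }) {
    local_rank_str = std::getenv(env_var);
    if (local_rank_str) break;
  }
  std::printf("local rank: %s\n",
              local_rank_str ? local_rank_str : "<not set>");
}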

View File

@ -1128,9 +1128,8 @@ struct ViewOffset<
KOKKOS_INLINE_FUNCTION constexpr ViewOffset(
const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1),
static_assert(((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1)),
"ViewOffset LayoutLeft and LayoutRight are only compatible "
"when rank <= 1");
}
@ -1778,8 +1777,7 @@ struct ViewOffset<
const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs)
: m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {
static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) ||
(DimRHS::rank == 1 && dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1),
(DimRHS::rank == 1 && dimension_type::rank == 1),
"ViewOffset LayoutRight and LayoutLeft are only compatible "
"when rank <= 1");
}
@ -3059,10 +3057,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
std::is_trivially_copy_assignable<Dummy>::value>
construct_shared_allocation() {
// Shortcut for zero initialization
ValueType value{};
// On A64FX memset seems to do the wrong thing with regards to first touch
// leading to the significant performance issues
#ifndef KOKKOS_ARCH_A64FX
ValueType value{};
if (Impl::is_zero_byte(value)) {
uint64_t kpID = 0;
if (Kokkos::Profiling::profileLibraryLoaded()) {
@ -3539,9 +3537,7 @@ class ViewMapping<
typename SrcTraits::array_layout>::value ||
std::is_same<typename DstTraits::array_layout,
Kokkos::LayoutStride>::value ||
(DstTraits::dimension::rank == 0) ||
(DstTraits::dimension::rank == 1 &&
DstTraits::dimension::rank_dynamic == 1)
(DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1)
};
public:

View File

@ -73,6 +73,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
SET(COMPILE_ONLY_SOURCES
TestArray.cpp
TestCreateMirror.cpp
TestDetectionIdiom.cpp
TestInterOp.cpp
TestLegionInteroperability.cpp
@ -86,6 +87,7 @@ ENDIF()
KOKKOS_ADD_EXECUTABLE(
TestCompileOnly
SOURCES
TestCompileMain.cpp
${COMPILE_ONLY_SOURCES}
)

View File

@ -0,0 +1 @@
int main() {}

View File

@ -0,0 +1,126 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
template <typename TestView, typename MemorySpace>
void check_memory_space(TestView, MemorySpace) {
static_assert(
std::is_same<typename TestView::memory_space, MemorySpace>::value, "");
}
template <class View>
auto host_mirror_test_space(View) {
return std::conditional_t<
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
typename View::memory_space>::accessible,
typename View::memory_space, Kokkos::HostSpace>{};
}
template <typename View>
void test_create_mirror_properties(const View& view) {
using namespace Kokkos;
using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space;
// clang-format off
// create_mirror
check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror( view), host_mirror_test_space(view));
check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror_view
check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view));
check_memory_space(create_mirror_view( view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{});
check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{});
// create_mirror view_alloc
check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror view_alloc + execution space
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view view_alloc + execution space
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view));
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{});
check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy
check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{});
// create_mirror_view_and_copy view_alloc + execution space
check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{});
check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{});
// clang-format on
}
void test() {
Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view("device view",
10);
Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10);
test_create_mirror_properties(device_view);
test_create_mirror_properties(host_view);
}
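A minimal usage sketch of the host-mirror workflow these overloads support (illustration only, not part of the patch; assumes Kokkos has been initialized):

#include <Kokkos_Core.hpp>

// Fill a device view on the host, then copy the data over. When the view is
// already host-accessible, create_mirror_view simply returns the view itself.
void fill_and_upload() {
  Kokkos::View<double*> data("data", 100);          // default (device) memory space
  auto h_data = Kokkos::create_mirror_view(data);   // host mirror, or the view itself
  for (int i = 0; i < 100; ++i) h_data(i) = i;      // initialize on the host
  Kokkos::deep_copy(data, h_data);                  // copy host -> device
}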

View File

@ -92,5 +92,3 @@ static_assert(std::is_same<difference_type<Woof>, int>::value,
static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
"Bark's difference_type should be ptrdiff_t!");
} // namespace Example
int main() {}

View File

@ -45,20 +45,23 @@
#include <Kokkos_Core.hpp>
#include <cstdio>
namespace Test {
namespace {
template <class Device>
template <class Device, class T, T ImbalanceSz>
struct TestScan {
using execution_space = Device;
using value_type = int64_t;
using value_type = T;
Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
KOKKOS_INLINE_FUNCTION
void operator()(const int iwork, value_type& update,
const bool final_pass) const {
const value_type n = iwork + 1;
const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0;
const value_type n = iwork + 1;
const value_type imbalance =
((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz))
? ImbalanceSz
: value_type(0);
// Insert an artificial load imbalance
@ -133,12 +136,29 @@ struct TestScan {
}
}
};
} // namespace
TEST(TEST_CATEGORY, scan) {
TestScan<TEST_EXECSPACE>::test_range(1, 1000);
TestScan<TEST_EXECSPACE>(0);
TestScan<TEST_EXECSPACE>(100000);
TestScan<TEST_EXECSPACE>(10000000);
TEST_EXECSPACE().fence();
constexpr auto imbalance_size = 1000;
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>::test_range(1, 1000);
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(0);
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(100000);
TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(10000000);
}
TEST(TEST_CATEGORY, small_size_scan) {
constexpr auto imbalance_size = 10; // Pick to not overflow...
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(0);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(5);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(10);
TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(
static_cast<std::size_t>(
std::sqrt(std::numeric_limits<std::int8_t>::max())));
constexpr auto short_imbalance_size = 100; // Pick to not overflow...
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(0);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(5);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(100);
TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(
static_cast<std::size_t>(
std::sqrt(std::numeric_limits<std::int16_t>::max())));
}
} // namespace Test
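The sqrt(numeric_limits<T>::max()) bound used above keeps the scan representable: an inclusive prefix sum over 1..N peaks at N(N+1)/2, so choosing N near sqrt(max) keeps the running total within T. A quick standalone check of that bound (illustration only, not part of the patch):

#include <cmath>
#include <cstdint>
#include <limits>

int main() {
  // For int8_t: max = 127, N = floor(sqrt(127)) = 11, peak prefix sum = 11*12/2 = 66 <= 127.
  const int max8 = std::numeric_limits<std::int8_t>::max();
  const int n8   = static_cast<int>(std::sqrt(max8));
  const int peak = n8 * (n8 + 1) / 2;
  return (peak <= max8) ? 0 : 1;  // exit code 0 means the bound holds
}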

View File

@ -1616,6 +1616,73 @@ struct TestTeamPolicyHandleByValue {
} // namespace
namespace {
template <typename ExecutionSpace>
struct TestRepeatedTeamReduce {
static constexpr int ncol = 1500; // nothing special, just some work
KOKKOS_FUNCTION void operator()(
const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
const {
// non-divisible by power of two to make triggering problems easier
constexpr int nlev = 129;
constexpr auto pi = Kokkos::Experimental::pi_v<double>;
double b = 0.;
for (int ri = 0; ri < 10; ++ri) {
// The contributions here must be sufficiently complex, simply adding ones
// wasn't enough to trigger the bug.
const auto g1 = [&](const int k, double &acc) {
acc += Kokkos::cos(pi * double(k) / nlev);
};
const auto g2 = [&](const int k, double &acc) {
acc += Kokkos::sin(pi * double(k) / nlev);
};
double a1, a2;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
b += a1;
b += a2;
}
const auto h = [&]() {
const auto col = team.league_rank();
v(col) = b + col;
};
Kokkos::single(Kokkos::PerTeam(team), h);
}
KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
if (v(i) != v(0) + i) {
++bad;
KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i);
}
}
TestRepeatedTeamReduce() : v("v", ncol) { test(); }
void test() {
int team_size_recommended =
Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended(
*this, Kokkos::ParallelForTag());
// Choose a non-recommended (non-power of two for GPUs) team size
int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1;
// The failure was non-deterministic so run the test a bunch of times
for (int it = 0; it < 100; ++it) {
Kokkos::parallel_for(
Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this);
int bad = 0;
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol),
*this, bad);
ASSERT_EQ(bad, 0) << " Failing in iteration " << it;
}
}
Kokkos::View<double *, ExecutionSpace> v;
};
} // namespace
} // namespace Test
/*--------------------------------------------------------------------------*/

View File

@ -134,5 +134,15 @@ TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) {
}
}
TEST(TEST_CATEGORY, repeated_team_reduce) {
#ifdef KOKKOS_ENABLE_OPENMPTARGET
if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value)
GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not "
"properly implemented";
#endif
TestRepeatedTeamReduce<TEST_EXECSPACE>();
}
} // namespace Test
#endif

View File

@ -92,8 +92,18 @@ TEST(TEST_CATEGORY, view_is_assignable) {
View<double*, left, d_exec>>::test(false, false, 10);
// Layout assignment
Impl::TestAssignability<View<int, left, d_exec>,
View<int, right, d_exec>>::test(true, true);
Impl::TestAssignability<View<int*, left, d_exec>,
View<int*, right, d_exec>>::test(true, true, 10);
Impl::TestAssignability<View<int[5], left, d_exec>,
View<int*, right, d_exec>>::test(false, false, 10);
Impl::TestAssignability<View<int[10], left, d_exec>,
View<int*, right, d_exec>>::test(false, true, 10);
Impl::TestAssignability<View<int*, left, d_exec>,
View<int[5], right, d_exec>>::test(true, true);
Impl::TestAssignability<View<int[5], left, d_exec>,
View<int[10], right, d_exec>>::test(false, false);
// This could be made possible (due to the degenerate nature of the views) but
// we do not allow this yet

199
lib/kokkos/kokkos_5538.diff Normal file
View File

@ -0,0 +1,199 @@
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 22af411f32..530510a0d1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
-# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
@@ -401,6 +401,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt
KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)
+KOKKOS_INTERNAL_USE_ARCH_HOPPER90 := $(call kokkos_has_string,$(KOKKOS_ARCH),Hopper90)
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
@@ -414,7 +415,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE
+ $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \
+ $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
+ $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \
- + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))
+ + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86) \
+ + $(KOKKOS_INTERNAL_USE_ARCH_HOPPER90))
#SEK: This seems like a bug to me
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
@@ -1194,6 +1196,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1)
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER")
+ tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90")
+ KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90
+ endif
ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in
index 88ddc48378..b83ced9243 100644
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@@ -102,6 +102,7 @@
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
+#cmakedefine KOKKOS_ARCH_HOPPER90
#cmakedefine KOKKOS_ARCH_AMD_ZEN
#cmakedefine KOKKOS_ARCH_AMD_ZEN2
#cmakedefine KOKKOS_ARCH_AMD_ZEN3
diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
index f56cef1651..2585a6a64c 100644
--- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
+++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc
@@ -74,6 +74,7 @@ int main() {
case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break;
+ case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break;
default:
std::cout << "Compute capability " << compute_capability
<< " is not supported" << std::endl;
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index ef16aad047..c1d76cceeb 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -86,6 +86,7 @@ KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKK
KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS")
KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS")
+KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS")
IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_UNSUPPORTED_ARCHS)
SET(KOKKOS_SHOW_HIP_ARCHS ON)
@@ -544,6 +545,7 @@ CHECK_CUDA_ARCH(VOLTA72 sm_72)
CHECK_CUDA_ARCH(TURING75 sm_75)
CHECK_CUDA_ARCH(AMPERE80 sm_80)
CHECK_CUDA_ARCH(AMPERE86 sm_86)
+CHECK_CUDA_ARCH(HOPPER90 sm_90)
SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@@ -806,6 +808,10 @@ IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86)
SET(KOKKOS_ARCH_AMPERE ON)
ENDIF()
+IF (KOKKOS_ARCH_HOPPER90)
+ SET(KOKKOS_ARCH_HOPPER ON)
+ENDIF()
+
#Regardless of version, make sure we define the general architecture name
IF (KOKKOS_ARCH_VEGA900 OR KOKKOS_ARCH_VEGA906 OR KOKKOS_ARCH_VEGA908 OR KOKKOS_ARCH_VEGA90A)
SET(KOKKOS_ARCH_VEGA ON)
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 56f9117844..fcd4773dbc 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -232,7 +232,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
case 61: return 96;
case 70:
case 80:
- case 86: return 8;
+ case 86:
+ case 90: return 8;
case 75: return 32;
default:
Kokkos::Impl::throw_runtime_exception(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
index 40a263561f..8c40ebd60d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_Conversion.hpp
@@ -418,7 +418,7 @@ KOKKOS_INLINE_FUNCTION
#endif // CUDA_VERSION >= 11000 && CUDA_VERSION < 11010
#if CUDA_VERSION >= 11010 && \
- ((defined(KOKKOS_ARCH_AMPERE80) || defined(KOKKOS_ARCH_AMPERE86)))
+ ((defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)))
KOKKOS_INLINE_FUNCTION
bhalf_t cast_to_bhalf(bhalf_t val) { return val; }
KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
index f9451ecfe6..2ce1efb98c 100644
--- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
+++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp
@@ -51,7 +51,7 @@ namespace Kokkos::Experimental::Impl {
struct OpenACC_Traits {
#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_AMPERE)
+ defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_HOPPER)
static constexpr acc_device_t dev_type = acc_device_nvidia;
static constexpr bool may_fallback_to_host = false;
#else
diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index a9bc085912..27ee1d4232 100644
--- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -115,8 +115,9 @@ void OpenMPTargetInternal::impl_initialize() {
// FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures
// from Pascal and upwards.
-#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
omp_set_num_teams(512);
#endif
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 840db4327c..7e5addbc5b 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -155,7 +155,7 @@ void SYCL::impl_initialize(InitializationSettings const& settings) {
#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_ARCH_KEPLER) && \
!defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL) && \
!defined(KOKKOS_ARCH_VOLTA) && !defined(KOKKOS_ARCH_TURING75) && \
- !defined(KOKKOS_ARCH_AMPERE)
+ !defined(KOKKOS_ARCH_AMPERE) && !defined(KOKKOS_ARCH_HOPPER)
if (!settings.has_device_id() && gpu_devices.empty()) {
Impl::SYCLInternal::singleton().initialize(sycl::device());
return;
diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 5ac7d8af30..ba101f699e 100644
--- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -335,9 +335,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory
@@ -367,9 +368,10 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
return std::min<int>({
int(m_space.impl_internal_space_instance()->m_maxWorkgroupSize),
// FIXME_SYCL Avoid requesting to many registers on NVIDIA GPUs.
-#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
- defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
- defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || \
+ defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+ defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE) || \
+ defined(KOKKOS_ARCH_HOPPER)
256,
#endif
max_threads_for_memory

523
lib/kokkos/kokkos_5706.diff Normal file
View File

@ -0,0 +1,523 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index fcd4773dbc..30b6958a67 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,7 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1)
// NOTE these number can be obtained several ways:
// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
// "Compute Capability" first and check what is the smallest "Shared Memory
@@ -242,6 +241,7 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
return 0;
}() * 1024;
}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 5811498e01..e22eb3b842 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -569,12 +569,6 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
}
#endif
-#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API
- cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
-#else
- cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
-#endif
-
// Init the array for used for arbitrarily sized atomics
if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index b7a80ad84f..5c4c3a7d39 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -93,10 +93,6 @@ namespace Impl {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize L1 cache and minimize shared memory:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
-// For 2.0 capability: 48 KB L1 and 16 KB shared
-//----------------------------------------------------------------------------
template <class DriverType>
__global__ static void cuda_parallel_launch_constant_memory() {
@@ -158,63 +154,105 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) {
}
}
-// This function needs to be template on DriverType and LaunchBounds
+// These functions needs to be template on DriverType and LaunchBounds
// so that the static bool is unique for each type combo
// KernelFuncPtr does not necessarily contain that type information.
+
template <class DriverType, class LaunchBounds, class KernelFuncPtr>
-inline void configure_shmem_preference(KernelFuncPtr const& func,
- bool prefer_shmem) {
+const cudaFuncAttributes& get_cuda_kernel_func_attributes(
+ const KernelFuncPtr& func) {
+ // Only call cudaFuncGetAttributes once for each unique kernel
+ // by leveraging static variable initialization rules
+ auto wrap_get_attributes = [&]() -> cudaFuncAttributes {
+ cudaFuncAttributes attr;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func));
+ return attr;
+ };
+ static cudaFuncAttributes func_attr = wrap_get_attributes();
+ return func_attr;
+}
+
+template <class DriverType, class LaunchBounds, class KernelFuncPtr>
+inline void configure_shmem_preference(const KernelFuncPtr& func,
+ const cudaDeviceProp& device_props,
+ const size_t block_size, int& shmem,
+ const size_t occupancy) {
#ifndef KOKKOS_ARCH_KEPLER
- // On Kepler the L1 has no benefit since it doesn't cache reads
+
+ const auto& func_attr =
+ get_cuda_kernel_func_attributes<DriverType, LaunchBounds>(func);
+
+ // Compute limits for number of blocks due to registers/SM
+ const size_t regs_per_sm = device_props.regsPerMultiprocessor;
+ const size_t regs_per_thread = func_attr.numRegs;
+ // The granularity of register allocation is chunks of 256 registers per warp
+ // -> 8 registers per thread
+ const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
+ const size_t max_blocks_regs =
+ regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute how many threads per sm we actually want
+ const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
+ // only allocate multiples of warp size
+ const size_t num_threads_desired =
+ ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;
+ // Get close to the desired occupancy,
+ // don't undershoot by much but also don't allocate a whole new block just
+ // because one is a few threads over otherwise.
+ size_t num_blocks_desired =
+ (num_threads_desired + block_size * 0.8) / block_size;
+ num_blocks_desired = ::std::min(max_blocks_regs, num_blocks_desired);
+ if (num_blocks_desired == 0) num_blocks_desired = 1;
+
+ // Calculate how much shared memory we need per block
+ size_t shmem_per_block = shmem + func_attr.sharedSizeBytes;
+
+ // The minimum shared memory allocation we can have in total per SM is 8kB.
+ // If we want to lower occupancy we have to make sure we request at least that
+ // much in aggregate over all blocks, so that shared memory actually becomes a
+ // limiting factor for occupancy
+ constexpr size_t min_shmem_size_per_sm = 8192;
+ if ((occupancy < 100) &&
+ (shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)) {
+ shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;
+ // Need to set the caller's shmem variable so that the
+ // kernel launch uses the correct dynamic shared memory request
+ shmem = shmem_per_block - func_attr.sharedSizeBytes;
+ }
+
+ // Compute the carveout fraction we need based on occupancy
+ // Use multiples of 8kB
+ const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor;
+ size_t carveout = shmem_per_block == 0
+ ? 0
+ : 100 *
+ (((num_blocks_desired * shmem_per_block +
+ min_shmem_size_per_sm - 1) /
+ min_shmem_size_per_sm) *
+ min_shmem_size_per_sm) /
+ max_shmem_per_sm;
+ if (carveout > 100) carveout = 100;
+
+ // Set the carveout, but only call it once per kernel or when it changes
auto set_cache_config = [&] {
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
- func,
- (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
- return prefer_shmem;
+ KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetAttribute(
+ func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+ return carveout;
};
- static bool cache_config_preference_cached = set_cache_config();
- if (cache_config_preference_cached != prefer_shmem) {
+ // Store the value in a static variable so we only reset if needed
+ static size_t cache_config_preference_cached = set_cache_config();
+ if (cache_config_preference_cached != carveout) {
cache_config_preference_cached = set_cache_config();
}
#else
// Use the parameters so we don't get a warning
(void)func;
- (void)prefer_shmem;
+ (void)device_props;
+ (void)block_size;
+ (void)occupancy;
#endif
}
-template <class Policy>
-std::enable_if_t<Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const& policy, cudaDeviceProp const& properties,
- cudaFuncAttributes const& attributes, dim3 const& block, int& shmem,
- bool& prefer_shmem) {
- int const block_size = block.x * block.y * block.z;
- int const desired_occupancy = policy.impl_get_desired_occupancy().value();
-
- size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties);
- size_t const static_shmem = attributes.sharedSizeBytes;
-
- // round to nearest integer and avoid division by zero
- int active_blocks = std::max(
- 1, static_cast<int>(std::round(
- static_cast<double>(properties.maxThreadsPerMultiProcessor) /
- block_size * desired_occupancy / 100)));
- int const dynamic_shmem =
- shmem_per_sm_prefer_l1 / active_blocks - static_shmem;
-
- if (dynamic_shmem > shmem) {
- shmem = dynamic_shmem;
- prefer_shmem = false;
- }
-}
-
-template <class Policy>
-std::enable_if_t<!Policy::experimental_contains_desired_occupancy>
-modify_launch_configuration_if_desired_occupancy_is_specified(
- Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&,
- dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {}
-
// </editor-fold> end Some helper functions for launch code readability }}}1
//==============================================================================
@@ -348,7 +386,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -358,8 +396,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
void const* args[] = {&driver};
@@ -442,7 +487,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
//----------------------------------------
auto const& graph = Impl::get_cuda_graph_from_kernel(driver);
KOKKOS_EXPECTS(bool(graph));
@@ -452,8 +497,15 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- Impl::configure_shmem_preference<DriverType, LaunchBounds>(
- base_t::get_kernel_func(), prefer_shmem);
+ if constexpr (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<DriverType, LaunchBounds>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -566,7 +618,7 @@ struct CudaParallelLaunchKernelInvoker<
#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
inline static void create_parallel_launch_graph_node(
DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem,
- CudaInternal const* cuda_instance, bool prefer_shmem) {
+ CudaInternal const* cuda_instance) {
// Just use global memory; coordinating through events to share constant
// memory with the non-graph interface is not really reasonable since
// events don't work with Graphs directly, and this would anyway require
@@ -580,7 +632,7 @@ struct CudaParallelLaunchKernelInvoker<
DriverType, LaunchBounds,
Experimental::CudaLaunchMechanism::GlobalMemory>;
global_launch_impl_t::create_parallel_launch_graph_node(
- driver, grid, block, shmem, cuda_instance, prefer_shmem);
+ driver, grid, block, shmem, cuda_instance);
}
#endif
};
@@ -613,8 +665,7 @@ struct CudaParallelLaunchImpl<
inline static void launch_kernel(const DriverType& driver, const dim3& grid,
const dim3& block, int shmem,
- const CudaInternal* cuda_instance,
- bool prefer_shmem) {
+ const CudaInternal* cuda_instance) {
if (!Impl::is_empty_launch(grid, block)) {
// Prevent multiple threads to simultaneously set the cache configuration
// preference and launch the same kernel
@@ -623,18 +674,17 @@ struct CudaParallelLaunchImpl<
Impl::check_shmem_request(cuda_instance, shmem);
- // If a desired occupancy is specified, we compute how much shared memory
- // to ask for to achieve that occupancy, assuming that the cache
- // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
- // shared memory computed is actually smaller than `shmem` we overwrite
- // `shmem` and set `prefer_shmem` to `false`.
- modify_launch_configuration_if_desired_occupancy_is_specified(
- driver.get_policy(), cuda_instance->m_deviceProp,
- get_cuda_func_attributes(), block, shmem, prefer_shmem);
-
- Impl::configure_shmem_preference<
- DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
- base_t::get_kernel_func(), prefer_shmem);
+ if (DriverType::Policy::
+ experimental_contains_desired_occupancy) {
+ int desired_occupancy =
+ driver.get_policy().impl_get_desired_occupancy().value();
+ size_t block_size = block.x * block.y * block.z;
+ Impl::configure_shmem_preference<
+ DriverType,
+ Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
+ shmem, desired_occupancy);
+ }
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
@@ -650,18 +700,9 @@ struct CudaParallelLaunchImpl<
}
static cudaFuncAttributes get_cuda_func_attributes() {
- // Race condition inside of cudaFuncGetAttributes if the same address is
- // given requires using a local variable as input instead of a static Rely
- // on static variable initialization to make sure only one thread executes
- // the code and the result is visible.
- auto wrap_get_attributes = []() -> cudaFuncAttributes {
- cudaFuncAttributes attr_tmp;
- KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
- return attr_tmp;
- };
- static cudaFuncAttributes attr = wrap_get_attributes();
- return attr;
+ return get_cuda_kernel_func_attributes<
+ DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+ base_t::get_kernel_func());
}
};
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index e586bb4cc6..0e348c092a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -121,8 +121,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
maxblocks[1]),
1);
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 3) {
const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
KOKKOS_ASSERT(block.x > 0);
@@ -139,8 +138,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 4) {
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
// threadIdx.z
@@ -158,8 +156,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 5) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
// threadIdx.z
@@ -175,8 +172,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
(m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else if (RP::rank == 6) {
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
// threadIdx.z
@@ -191,8 +187,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
std::min<array_index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5],
maxblocks[2]));
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_rp.space().impl_internal_space_instance());
} else {
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
}
@@ -405,8 +400,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
index ac160f8fe2..d1031751c2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp
@@ -135,8 +135,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
#endif
CudaParallelLaunch<ParallelFor, LaunchBounds>(
- *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
- false);
+ *this, grid, block, 0, m_policy.space().impl_internal_space_instance());
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -375,8 +374,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
if (m_result_ptr) {
@@ -726,16 +725,16 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
m_final = false;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScan, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
}
@@ -1038,16 +1037,16 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
m_final = false;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
}
#endif
m_final = true;
CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>(
*this, grid, block, shmem,
- m_policy.space().impl_internal_space_instance(),
- false); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
const int size = Analysis::value_size(m_functor);
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
index cdd16085b3..ea9430b812 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -552,8 +552,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelFor, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
}
ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
@@ -878,8 +878,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
CudaParallelLaunch<ParallelReduce, LaunchBounds>(
*this, grid, block, shmem_size_total,
- m_policy.space().impl_internal_space_instance(),
- true); // copy to device and execute
+ m_policy.space()
+ .impl_internal_space_instance()); // copy to device and execute
if (!m_result_ptr_device_accessible) {
m_policy.space().fence(
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 34d4bef9fd..178012431c 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -428,11 +428,6 @@ struct CudaReductionsFunctor<FunctorType, false, false> {
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
-// Maximize shared memory and minimize L1 cache:
-// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
-// For 2.0 capability: 48 KB shared and 16 KB L1
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y <= 1024
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
index fb3a6b138f..a12378a891 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -100,8 +100,7 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
const int shared = 0;
Kokkos::Impl::CudaParallelLaunch<Self>(
- *this, grid, block, shared, Cuda().impl_internal_space_instance(),
- false);
+ *this, grid, block, shared, Cuda().impl_internal_space_instance());
}
inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
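To make the carveout arithmetic in configure_shmem_preference above concrete, the following standalone sketch runs the same sequence of computations for one assumed configuration: a V100-like SM (64 K registers, 2048 threads, 96 KiB shared memory), a hypothetical kernel with 42 registers per thread and no static shared memory, block_size = 128, desired occupancy = 50%, and no dynamic shared memory requested. Illustration only, not part of the patch:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  using std::size_t;
  // Assumed device properties and kernel attributes (not from the patch).
  const size_t regs_per_sm = 64 * 1024, max_threads_per_sm = 2048;
  const size_t max_shmem_per_sm = 96 * 1024, min_shmem_size_per_sm = 8192;
  const size_t regs_per_thread = 42, block_size = 128, occupancy = 50;

  // Register limit: registers are allocated in chunks of 256 per warp, i.e. 8 per thread.
  const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);           // 48
  const size_t max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);  // 10

  // Desired resident threads, rounded to whole warps, then converted to blocks.
  const size_t num_threads_desired =
      ((max_threads_per_sm * occupancy / 100 + 31) / 32) * 32;                        // 1024
  size_t num_blocks_desired = (num_threads_desired + block_size * 0.8) / block_size;  // 8
  num_blocks_desired = std::min(max_blocks_regs, num_blocks_desired);

  // Inflate the per-block request so shared memory becomes the limiting factor.
  size_t shmem_per_block = 0;  // requested dynamic + static shared memory (0 in this example)
  if (occupancy < 100 && shmem_per_block * num_blocks_desired < min_shmem_size_per_sm)
    shmem_per_block = min_shmem_size_per_sm / num_blocks_desired;                     // 1024 B

  // Carveout as a percentage of shared memory per SM, in 8 KiB steps.
  size_t carveout = 100 *
      (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) /
        min_shmem_size_per_sm) * min_shmem_size_per_sm) / max_shmem_per_sm;           // 8
  carveout = std::min<size_t>(carveout, 100);

  // 8 blocks x 128 threads = 1024 resident threads, i.e. the requested 50% occupancy.
  std::printf("blocks/SM = %zu, dynamic shmem/block = %zu B, carveout = %zu%%\n",
              num_blocks_desired, shmem_per_block, carveout);
}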

View File

@ -0,0 +1,46 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 30b6958a67..b94f053272 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -207,41 +207,6 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
-// NOTE these number can be obtained several ways:
-// * One option is to download the CUDA Occupancy Calculator spreadsheet, select
-// "Compute Capability" first and check what is the smallest "Shared Memory
-// Size Config" that is available. The "Shared Memory Per Multiprocessor" in
-// bytes is then to be found below in the summary.
-// * Another option would be to look for the information in the "Tuning
-// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in
-// the "Shared Memory" section (more tedious)
-inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) {
- int const compute_capability = properties.major * 10 + properties.minor;
- return [compute_capability]() {
- switch (compute_capability) {
- case 30:
- case 32:
- case 35: return 16;
- case 37: return 80;
- case 50:
- case 53:
- case 60:
- case 62: return 64;
- case 52:
- case 61: return 96;
- case 70:
- case 80:
- case 86:
- case 90: return 8;
- case 75: return 32;
- default:
- Kokkos::Impl::throw_runtime_exception(
- "Unknown device in cuda block size deduction");
- }
- return 0;
- }() * 1024;
-}
-
} // namespace Impl
} // namespace Kokkos

204
lib/kokkos/kokkos_5739.diff Normal file
View File

@ -0,0 +1,204 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index b94f053272..252c13c524 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -53,17 +53,69 @@
namespace Kokkos {
namespace Impl {
+inline int cuda_warp_per_sm_allocation_granularity(
+ cudaDeviceProp const& properties) {
+ // Allocation granularity of warps in each sm
+ switch (properties.major) {
+ case 3:
+ case 5:
+ case 7:
+ case 8:
+ case 9: return 4;
+ case 6: return (properties.minor == 0 ? 2 : 4);
+ default:
+ throw_runtime_exception(
+ "Unknown device in cuda warp per sm allocation granularity");
+ return 0;
+ }
+}
+
+inline int cuda_max_warps_per_sm_registers(
+ cudaDeviceProp const& properties, cudaFuncAttributes const& attributes) {
+ // Maximum number of warps per sm as a function of register counts,
+ // subject to the constraint that warps are allocated with a fixed granularity
+ int const max_regs_per_block = properties.regsPerBlock;
+ int const regs_per_warp = attributes.numRegs * properties.warpSize;
+ int const warp_granularity =
+ cuda_warp_per_sm_allocation_granularity(properties);
+ // The granularity of register allocation is chunks of 256 registers per warp,
+ // which implies a need to over-allocate, so we round up
+ int const allocated_regs_per_warp = (regs_per_warp + 256 - 1) / 256;
+
+ // The maximum number of warps per SM is constrained from above by register
+ // allocation. To satisfy the constraint that warps per SM is allocated at a
+ // finite granularity, we need to round down.
+ int const max_warps_per_sm =
+ warp_granularity *
+ (max_regs_per_block / (allocated_regs_per_warp * warp_granularity));
+
+ return max_warps_per_sm;
+}
+
inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
cudaFuncAttributes const& attributes,
int block_size, size_t dynamic_shmem) {
- // Limits due do registers/SM
+ // Limits due to registers/SM
int const regs_per_sm = properties.regsPerMultiprocessor;
int const regs_per_thread = attributes.numRegs;
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
int const allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- int const max_blocks_regs =
- regs_per_sm / (allocated_regs_per_thread * block_size);
+ int max_blocks_regs = regs_per_sm / (allocated_regs_per_thread * block_size);
+
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ int const max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(properties, attributes);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / properties.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
// Limits due to shared memory/SM
size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor;
@@ -207,6 +259,19 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
LaunchBounds{});
}
+template <class LaunchBounds>
+int cuda_get_opt_block_size_no_shmem(const cudaFuncAttributes& attr,
+ LaunchBounds) {
+ auto const& prop = Kokkos::Cuda().cuda_device_prop();
+
+ // Thin version of cuda_get_opt_block_size for cases where there is no shared
+ // memory
+ auto const block_size_to_no_shmem = [&](int /*block_size*/) { return 0; };
+
+ return cuda_deduce_block_size(false, prop, attr, block_size_to_no_shmem,
+ LaunchBounds{});
+}
+
} // namespace Impl
} // namespace Kokkos
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 5c4c3a7d39..170183ca0a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -188,9 +188,23 @@ inline void configure_shmem_preference(const KernelFuncPtr& func,
// The granularity of register allocation is chunks of 256 registers per warp
// -> 8 registers per thread
const size_t allocated_regs_per_thread = 8 * ((regs_per_thread + 8 - 1) / 8);
- const size_t max_blocks_regs =
+ size_t max_blocks_regs =
regs_per_sm / (allocated_regs_per_thread * block_size);
+ // Compute the maximum number of warps as a function of the number of
+ // registers
+ const size_t max_warps_per_sm_registers =
+ cuda_max_warps_per_sm_registers(device_props, func_attr);
+
+ // Constrain the number of blocks to respect the maximum number of warps per
+ // SM On face value this should be an equality, but due to the warp
+ // granularity constraints noted in `cuda_max_warps_per_sm_registers` the
+ // left-hand-side of this comparison can overshoot what the hardware allows
+ // based on register counts alone
+ while ((max_blocks_regs * block_size / device_props.warpSize) >
+ max_warps_per_sm_registers)
+ max_blocks_regs--;
+
// Compute how many threads per sm we actually want
const size_t max_threads_per_sm = device_props.maxThreadsPerMultiProcessor;
// only allocate multiples of warp size
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
index 0e348c092a..7e4f62f12e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp
@@ -67,6 +67,34 @@
namespace Kokkos {
namespace Impl {
+template <typename ParallelType, typename Policy, typename LaunchBounds>
+int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) {
+ cudaFuncAttributes attr =
+ CudaParallelLaunch<ParallelType,
+ LaunchBounds>::get_cuda_func_attributes();
+ auto const& prop = pol.space().cuda_device_prop();
+
+ // Limits due to registers/SM, MDRange doesn't have
+ // shared memory constraints
+ int const optimal_block_size =
+ Kokkos::Impl::cuda_get_opt_block_size_no_shmem(attr, LaunchBounds{});
+
+ // Compute how many blocks of this size we can launch, based on warp
+ // constraints
+ int const max_warps_per_sm_registers =
+ Kokkos::Impl::cuda_max_warps_per_sm_registers(prop, attr);
+ int const max_num_threads_from_warps =
+ max_warps_per_sm_registers * prop.warpSize;
+ int const max_num_blocks = max_num_threads_from_warps / optimal_block_size;
+
+ // Compute the total number of threads
+ int const max_threads_per_sm = optimal_block_size * max_num_blocks;
+
+ return std::min(
+ max_threads_per_sm,
+ static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+}
+
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
@@ -85,18 +113,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelFor,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due to registers/SM, MDRange doesn't have
- // shared memory constraints
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelFor>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_rp; }
inline __device__ void operator()() const {
@@ -258,17 +275,7 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
public:
template <typename Policy, typename Functor>
static int max_tile_size_product(const Policy& pol, const Functor&) {
- cudaFuncAttributes attr =
- CudaParallelLaunch<ParallelReduce,
- LaunchBounds>::get_cuda_func_attributes();
- auto const& prop = pol.space().cuda_device_prop();
- // Limits due do registers/SM
- int const regs_per_sm = prop.regsPerMultiprocessor;
- int const regs_per_thread = attr.numRegs;
- int const max_threads_per_sm = regs_per_sm / regs_per_thread;
- return std::min(
- max_threads_per_sm,
- static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+ return max_tile_size_product_helper<ParallelReduce>(pol, LaunchBounds{});
}
Policy const& get_policy() const { return m_policy; }
inline __device__ void exec_range(reference_type update) const {
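A small worked example of the register constraint described in the comments above, with assumed numbers (not taken from the patch): a kernel using 96 registers per thread on an SM with 64 K registers needs 96 x 32 = 3072 registers per warp, so 21 warps would fit on register counts alone, but rounding down to a warp allocation granularity of 4 leaves 20 warps per SM, which the while-loop above then enforces on the block count.

#include <cstdio>

int main() {
  // Assumed values, not from the patch: a Volta/Ampere-like SM.
  const int regs_per_sm      = 64 * 1024;
  const int warp_size        = 32;
  const int regs_per_thread  = 96;
  const int warp_granularity = 4;  // warps are handed out in groups of 4 here

  // Registers are allocated per warp in chunks of 256, so round up.
  const int regs_per_warp           = regs_per_thread * warp_size;          // 3072
  const int allocated_regs_per_warp = 256 * ((regs_per_warp + 255) / 256);  // 3072, already a multiple

  // Round the register-limited warp count down to the allocation granularity.
  const int max_warps_per_sm =
      warp_granularity * (regs_per_sm / (allocated_regs_per_warp * warp_granularity));  // 20, not 21

  std::printf("register-limited warps per SM: %d\n", max_warps_per_sm);
}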

View File

@ -0,0 +1,63 @@
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index 170183ca0a..ba43e362bb 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -412,12 +412,16 @@ struct CudaParallelLaunchKernelInvoker<
Impl::check_shmem_request(cuda_instance, shmem);
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
+ /*
int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
void const* args[] = {&driver};
@@ -511,14 +515,17 @@ struct CudaParallelLaunchKernelInvoker<
if (!Impl::is_empty_launch(grid, block)) {
Impl::check_shmem_request(cuda_instance, shmem);
- if constexpr (DriverType::Policy::
+ if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<DriverType, LaunchBounds>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver);
@@ -690,14 +697,17 @@ struct CudaParallelLaunchImpl<
if (DriverType::Policy::
experimental_contains_desired_occupancy) {
- int desired_occupancy =
+ /*int desired_occupancy =
driver.get_policy().impl_get_desired_occupancy().value();
size_t block_size = block.x * block.y * block.z;
Impl::configure_shmem_preference<
DriverType,
Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size,
- shmem, desired_occupancy);
+ shmem, desired_occupancy);*/
+ Kokkos::Impl::throw_runtime_exception(
+ std::string("Cuda graph node creation FAILED:"
+ " occupancy requests are currently broken."));
}
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();

View File

@ -29,3 +29,4 @@ tag: 3.5.00 date: 11:19:2021 master: c28a8b03 release: 21b879e4
tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff
tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404
tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb
tag: 3.7.01 date: 12:01:2022 master: 61d7db55 release: d3bb8cfe

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -0,0 +1,3 @@
#Official Tool: clang-format version 8.0.0
DisableFormat: true
SortIncludes: false

View File

@ -76,7 +76,7 @@ namespace Impl {
/// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__
#ifdef __CUDACC_RDC__
__constant__ extern
@ -138,42 +138,33 @@ namespace {
static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace
#ifdef __CUDACC_RDC__
inline
#else
static
#endif
void
copy_cuda_lock_arrays_to_device() {
if (lock_array_copied == 0) {
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
&CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
sizeof(int32_t*));
cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
&CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
sizeof(int32_t*));
}
lock_array_copied = 1;
}
} // namespace Impl
} // namespace desul
/* It is critical that this code be a macro, so that it will
capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE;
putting this in an inline function will NOT do the right thing! */
#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
if (::desul::Impl::lock_array_copied == 0) { \
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE, \
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
sizeof(int32_t*)); \
cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE, \
&::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \
sizeof(int32_t*)); \
} \
::desul::Impl::lock_array_copied = 1; \
}
#endif /* defined( __CUDACC__ ) */
#endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
namespace desul {
#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
inline void ensure_cuda_lock_arrays_on_device() {}
#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
static inline void ensure_cuda_lock_arrays_on_device() {
Impl::copy_cuda_lock_arrays_to_device();
}
#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
#endif
} // namespace desul
#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */

View File

@ -70,7 +70,7 @@ void init_lock_arrays_cuda() {
"init_lock_arrays_cuda: cudaMalloc host locks");
auto error_sync1 = cudaDeviceSynchronize();
copy_cuda_lock_arrays_to_device();
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
auto error_sync2 = cudaDeviceSynchronize();
@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() {
CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
#ifdef __CUDACC_RDC__
copy_cuda_lock_arrays_to_device();
DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif
}

View File

@ -190,6 +190,20 @@ namespace LAMMPS_NS {
Tp_BIAS,Tp_RMASS,Tp_ZERO>(i);
}
KOKKOS_INLINE_FUNCTION
static void init(value_type &update) {
update.fx = 0.0;
update.fy = 0.0;
update.fz = 0.0;
}
KOKKOS_INLINE_FUNCTION
static void join(value_type &update,
const value_type &source) {
update.fx += source.fx;
update.fy += source.fy;
update.fz += source.fz;
}
KOKKOS_INLINE_FUNCTION
static void init(volatile value_type &update) {
update.fx = 0.0;
@ -233,6 +247,15 @@ namespace LAMMPS_NS {
energy += c.compute_energy_item(i);
}
KOKKOS_INLINE_FUNCTION
static void init(value_type &update) {
update = 0.0;
}
KOKKOS_INLINE_FUNCTION
static void join(value_type &update,
const value_type &source) {
update += source;
}
KOKKOS_INLINE_FUNCTION
static void init(volatile value_type &update) {
update = 0.0;
}
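The non-volatile init/join overloads added here follow the pattern newer Kokkos versions expect from custom reduction functors, now that reductions no longer go through volatile references (the volatile overloads are kept for backward compatibility). A minimal standalone sketch of that pattern, with hypothetical names rather than the actual LAMMPS functors:

#include <Kokkos_Core.hpp>

struct ForceSum {
  struct value_type { double fx, fy, fz; };

  Kokkos::View<const double*[3]> f;  // hypothetical per-atom forces

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, value_type& update) const {
    update.fx += f(i, 0);
    update.fy += f(i, 1);
    update.fz += f(i, 2);
  }

  // Non-volatile overloads: these are the ones current Kokkos reductions call.
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& update) { update.fx = update.fy = update.fz = 0.0; }

  KOKKOS_INLINE_FUNCTION
  static void join(value_type& update, const value_type& source) {
    update.fx += source.fx;
    update.fy += source.fy;
    update.fz += source.fz;
  }
};

ForceSum::value_type total_force(Kokkos::View<const double*[3]> forces) {
  ForceSum::value_type total;
  Kokkos::parallel_reduce("total_force", forces.extent(0), ForceSum{forces}, total);
  return total;
}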

View File

@ -77,7 +77,6 @@ GPU_AWARE_UNKNOWN
using namespace LAMMPS_NS;
Kokkos::InitArguments KokkosLMP::args{-1, -1, -1, false};
int KokkosLMP::is_finalized = 0;
int KokkosLMP::init_ngpus = 0;
@ -110,7 +109,6 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
ngpus = 0;
int device = 0;
nthreads = 1;
numa = 1;
int iarg = 0;
while (iarg < narg) {
@ -189,30 +187,24 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
iarg += 2;
} else if (strcmp(arg[iarg],"n") == 0 ||
strcmp(arg[iarg],"numa") == 0) {
numa = utils::inumeric(FLERR, arg[iarg+1], false, lmp);
iarg += 2;
} else error->all(FLERR,"Invalid Kokkos command-line arg: {}", arg[iarg]);
}
// Initialize Kokkos. However, we cannot change any
// Kokkos library parameters after the first initalization
if (args.num_threads != -1) {
if ((args.num_threads != nthreads) || (args.num_numa != numa) || (args.device_id != device))
Kokkos::InitializationSettings args;
if (args.has_num_threads()) {
if ((args.get_num_threads() != nthreads) || (args.get_device_id() != device))
if (me == 0)
error->warning(FLERR,"Kokkos package already initalized, "
"cannot reinitialize with different parameters");
nthreads = args.num_threads;
numa = args.num_numa;
device = args.device_id;
error->warning(FLERR,"Kokkos package already initalized. Cannot change parameters");
nthreads = args.get_num_threads();
device = args.get_device_id();
ngpus = init_ngpus;
} else {
args.num_threads = nthreads;
args.num_numa = numa;
args.device_id = device;
args.set_num_threads(nthreads);
args.set_device_id(device);
init_ngpus = ngpus;
}
@ -350,7 +342,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
/* ---------------------------------------------------------------------- */
void KokkosLMP::initialize(Kokkos::InitArguments args, Error *error)
void KokkosLMP::initialize(Kokkos::InitializationSettings args, Error *error)
{
if (!Kokkos::is_initialized()) {
if (is_finalized)
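For reference, the Kokkos::InitializationSettings API that replaces Kokkos::InitArguments in this file is used along these lines (minimal sketch, values hypothetical):

#include <Kokkos_Core.hpp>

int main() {
  Kokkos::InitializationSettings settings;
  settings.set_num_threads(4);   // host threads per process
  settings.set_device_id(0);     // GPU to bind this process to
  Kokkos::initialize(settings);  // replaces Kokkos::initialize(InitArguments)
  {
    // ... parallel kernels ...
  }
  Kokkos::finalize();
}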

View File

@ -43,7 +43,6 @@ class KokkosLMP : protected Pointers {
int forward_fix_comm_changed;
int reverse_comm_changed;
int nthreads,ngpus;
int numa;
int auto_sync;
int gpu_aware_flag;
int neigh_thread;
@ -53,12 +52,11 @@ class KokkosLMP : protected Pointers {
double binsize;
static int is_finalized;
static Kokkos::InitArguments args;
static int init_ngpus;
KokkosLMP(class LAMMPS *, int, char **);
static void initialize(Kokkos::InitArguments, Error *);
static void initialize(Kokkos::InitializationSettings, Error *);
static void finalize();
void accelerator(int, char **);
int neigh_count(int);

View File

@ -524,6 +524,12 @@ struct PairReaxKokkosFindBondFunctor {
PairReaxFFKokkos<DeviceType> c;
PairReaxKokkosFindBondFunctor(PairReaxFFKokkos<DeviceType>* c_ptr):c(*c_ptr) {};
KOKKOS_INLINE_FUNCTION
void join(int &dst,
const int &src) const {
dst = MAX(dst,src);
}
KOKKOS_INLINE_FUNCTION
void join(volatile int &dst,
const volatile int &src) const {

View File

@ -52,7 +52,6 @@ class KokkosLMP {
int kokkos_exists;
int nthreads;
int ngpus;
int numa;
KokkosLMP(class LAMMPS *, int, char **) { kokkos_exists = 0; }
~KokkosLMP() {}

View File

@ -91,7 +91,7 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
nthreads = 1;
#ifdef _OPENMP
if (lmp->kokkos) {
nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
nthreads = lmp->kokkos->nthreads;
} else if (getenv("OMP_NUM_THREADS") == nullptr) {
nthreads = 1;
if (me == 0)